RIP CUDA <9.2: circleci, aten, and caffe2 (#36846)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/36846

Test Plan: Imported from OSS

Differential Revision: D21620850

Pulled By: ngimel

fbshipit-source-id: 7ad1676a12f86250f301095ffc6f365a3b370f34
This commit is contained in:
Xiang Gao 2020-05-18 13:36:16 -07:00 committed by Facebook GitHub Bot
parent b29e7f9b9d
commit 5e2d8745c8
12 changed files with 20 additions and 118 deletions

View file

@ -76,14 +76,6 @@ case "$image" in
DB=yes
VISION=yes
;;
pytorch-linux-xenial-cuda9-cudnn7-py3)
CUDA_VERSION=9.0
CUDNN_VERSION=7
ANACONDA_PYTHON_VERSION=3.6
PROTOBUF=yes
DB=yes
VISION=yes
;;
pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4)
CUDA_VERSION=9.2
CUDNN_VERSION=7

View file

@ -65,11 +65,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
# DO NOT install cmake here as it would install a version newer than 3.5, but
# we want to pin to version 3.5.
conda_install numpy pyyaml mkl mkl-include setuptools cffi typing future six
if [[ "$CUDA_VERSION" == 9.0* ]]; then
conda_install magma-cuda90 -c pytorch
elif [[ "$CUDA_VERSION" == 9.1* ]]; then
conda_install magma-cuda91 -c pytorch
elif [[ "$CUDA_VERSION" == 9.2* ]]; then
if [[ "$CUDA_VERSION" == 9.2* ]]; then
conda_install magma-cuda92 -c pytorch
elif [[ "$CUDA_VERSION" == 10.0* ]]; then
conda_install magma-cuda100 -c pytorch

View file

@ -72,26 +72,6 @@ Tensor prepare_matrix_for_cublas(Tensor& tensor, bool& transpose_tensor) {
return tensor_;
}
// Check https://github.com/pytorch/pytorch/issues/22078
// for information about the bug. We don't know the exact conditions that trigger it,
// but using Sgemm or Hgemm on Maxwell or Pascal seems to be a
// necessary condition.
static void checkCuda90Bug(int i_m, int i_n, int i_k)
{
#if CUDA_VERSION < 9200 && CUDA_VERSION >= 9000
static std::once_flag alreadyWarned;
const int LIMIT = 1 << 21;
if (i_m > LIMIT || i_n > LIMIT || i_k > LIMIT) {
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
if (prop->major == 5 || prop->major == 6) {
std::call_once(alreadyWarned, []() {
TORCH_WARN("Matrix multiplication for dimensions larger than 2^21 has known bugs on your combination of CUDA version and device type. Please consider upgrading to CUDA 9.2 or later.");
});
}
}
#endif
}
Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) {
TORCH_CHECK(
(mat1.dim() == 2) && (mat2.dim() == 2) &&
@ -143,9 +123,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
at::ScalarType scalar_type = self.scalar_type();
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] {
if (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::Float) {
checkCuda90Bug(static_cast<int>(m), static_cast<int>(n), static_cast<int>(k));
}
scalar_t alpha_val = alpha.to<scalar_t>();
scalar_t beta_val = beta.to<scalar_t>();
scalar_t* mat1_ptr = mat1_.data_ptr<scalar_t>();

View file

@ -578,9 +578,6 @@ namespace {
}
cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input){
#if CUDNN_VERSION < 7200 || CUDA_VERSION < 9010
return CUDNN_RNN_ALGO_STANDARD;
#else
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
const int64_t bsize = tensors.mini_batch;
//excluding Turing from using persistent rnn.
@ -599,17 +596,12 @@ namespace {
}
}
return CUDNN_RNN_ALGO_STANDARD;
#endif
}
cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) {
#if CUDNN_VERSION != 7103
// CUDNN 7.1.3 enforces RNN descriptor type to be identical to input/weight. This check throws an error for type
// promotion. The check has since been removed.
if (dtype == CUDNN_DATA_HALF) {
return CUDNN_DATA_FLOAT;
}
#endif
return dtype;
}

View file

@ -215,15 +215,10 @@ bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern) {
}
bool TensorCoreAvailable() {
// requires CUDA 9.0 and above
#if CUDA_VERSION < 9000
return false;
#else
int device = CaffeCudaGetDevice();
auto& prop = GetDeviceProperty(device);
return prop.major >= 7;
#endif
}
const char* cublasGetErrorString(cublasStatus_t error) {
@ -238,23 +233,18 @@ const char* cublasGetErrorString(cublasStatus_t error) {
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
#ifndef __HIP_PLATFORM_HCC__
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
#endif
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
#if CUDA_VERSION >= 6000
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
#if CUDA_VERSION >= 6050
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
#endif // CUDA_VERSION >= 6050
#endif // CUDA_VERSION >= 6000
#ifdef __HIP_PLATFORM_HCC__
#else
case rocblas_status_invalid_size:
return "rocblas_status_invalid_size";
case rocblas_status_perf_degraded:

View file

@ -5,18 +5,14 @@
#include <cuda.h>
#include <cuda_runtime.h>
// Disable strict aliasing errors for CUDA 9.
// The cuda_fp16.h header in CUDA 9 RC triggers this diagnostic.
// It is included by cusparse.h as well, so guarding the
// inclusion of that header here is not enough.
#if CUDA_VERSION >= 9000
#ifndef __HIP_PLATFORM_HCC__
#ifdef __GNUC__
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
#pragma GCC diagnostic push
#endif
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
#endif // CUDA_VERSION >= 9000
#endif // __HIP_PLATFORM_HCC__
#include <cublas_v2.h>
#include <curand.h>
@ -42,9 +38,7 @@
// CAFFE_HAS_CUDA_FP16 manually.
#ifndef CAFFE_HAS_CUDA_FP16
#if CUDA_VERSION >= 7050 || defined(__HIP_PLATFORM_HCC__)
#define CAFFE_HAS_CUDA_FP16
#endif // CUDA_VERSION >= 7050
#endif // CAFFE_HAS_CUDA_FP16
#ifdef CAFFE_HAS_CUDA_FP16
@ -59,13 +53,13 @@ constexpr int kFp16CUDADevicePropMajor = 3;
#endif
// Re-enable strict aliasing diagnostic if it was disabled.
#if CUDA_VERSION >= 9000
#ifndef __HIP_PLATFORM_HCC__
#ifdef __GNUC__
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
#pragma GCC diagnostic pop
#endif
#endif // __GNUC__
#endif // CUDA_VERSION >= 9000
#endif // __HIP_PLATFORM_HCC__
/**
* The maximum number of peers that each gpu can have when doing p2p setup.
@ -78,12 +72,12 @@ constexpr int kFp16CUDADevicePropMajor = 3;
namespace caffe2 {
#if CUDA_VERSION >= 9000
#ifndef __HIP_PLATFORM_HCC__
/**
* Empty class to identify TensorCore-based math
*/
class TensorCoreEngine {};
#endif
#endif // __HIP_PLATFORM_HCC__
#if CUDA_VERSION >= 10000
#define CAFFE2_CUDA_PTRATTR_MEMTYPE type

View file

@ -11,7 +11,8 @@ bool BatchMatMulOp<CUDAContext, DefaultEngine>::RunOnDevice() {
REGISTER_CUDA_OPERATOR(BatchMatMul, BatchMatMulOp<CUDAContext>);
#if CUDA_VERSION >= 9000
#ifndef __HIP_PLATFORM_HCC__
template <>
bool BatchMatMulOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {

View file

@ -138,7 +138,8 @@ bool FullyConnectedGradientOp<
return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_, this);
}
#if CUDA_VERSION >= 9000
#ifndef __HIP_PLATFORM_HCC__
// Require these to be defined otherwise TensorCore FC ops will end
// up calling the default FC implementation which doesn't have
@ -190,7 +191,8 @@ REGISTER_CUDA_OPERATOR(
DefaultEngine,
false /* don't transpose weight */>);
#if CUDA_VERSION >= 9000
#ifndef __HIP_PLATFORM_HCC__
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
FC,
TENSORCORE,
@ -214,6 +216,7 @@ REGISTER_CUDA_OPERATOR_WITH_ENGINE(
CUDAContext,
TensorCoreEngine,
false /* don't transpose weight */>);
#endif
} // namespace caffe2

View file

@ -127,11 +127,7 @@ warpHeap(K k, V v, K& keyHeapHead, K* keyHeap, V* valueHeap) {
int index = __popcll(getLaneMaskLt() & vote);
int total = __popcll(vote);
#else
#if CUDA_VERSION >= 9000
unsigned int vote = __ballot_sync(__activemask(), wantInsert);
#else
unsigned int vote = __ballot(wantInsert);
#endif
if (!vote) {
// Everything the warp has is smaller than our heap

View file

@ -171,11 +171,7 @@ __device__ void countRadixUsingMask(CountType counts[RadixSize],
#if defined(__HIP_PLATFORM_HCC__)
counts[j] += __popcll(__ballot(vote));
#else
#if CUDA_VERSION >= 9000
counts[j] += __popc(__ballot_sync(__activemask(), vote));
#else
counts[j] += __popc(__ballot(vote));
#endif
#endif // __HIP_PLATFORM_HCC__
}
}

View file

@ -68,12 +68,7 @@ __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFuncti
T index = __popcll(getLaneMaskLe() & vote);
T carry = __popcll(vote);
#else
#if CUDA_VERSION >= 9000
T vote = __ballot_sync(__activemask(), in);
#else
T vote = __ballot(in);
#endif // CUDA_VERSION
T index = __popc(getLaneMaskLe() & vote);
T carry = __popc(vote);
#endif // __HIP_PLATFORM_HCC__

View file

@ -865,24 +865,6 @@ CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>(
const cublasOperation_t cu_trans_B =
(trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
if (math_type == TensorProto_DataType_FLOAT) {
#if CUDA_VERSION < 9010
// loop over matrices in the batch
for (int i = 0; i < batch_size; ++i) {
Gemm<at::Half, CUDAContext>(
trans_A,
trans_B,
M,
N,
K,
alpha,
A[i],
B[i],
beta,
C[i],
context,
math_type);
}
#else
thrust::device_vector<const void*> A_device(A, A + batch_size);
thrust::device_vector<const void*> B_device(B, B + batch_size);
thrust::device_vector<void*> C_device(C, C + batch_size);
@ -909,7 +891,6 @@ CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>(
batch_size,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
#endif
} else if (math_type == TensorProto_DataType_FLOAT16) {
// Convert alpha, beta from float -> __half
const __half alpha_fp16 = at::Half(alpha);
@ -989,16 +970,6 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched<at::Half, CUDAContext>(
const cublasOperation_t cu_trans_B =
(trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
if (math_type == TensorProto_DataType_FLOAT) {
#if CUDA_VERSION < 9010 && !defined(__HIP_PLATFORM_HCC__)
// loop over matrices in the batch
for (int i = 0; i < batch_size; ++i) {
Gemm<at::Half, CUDAContext>(
trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context, math_type);
A += A_stride;
B += B_stride;
C += C_stride;
}
#else
CUBLAS_ENFORCE(cublasSetPointerMode(
context->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
#ifdef __HIP_PLATFORM_HCC__
@ -1060,7 +1031,6 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched<at::Half, CUDAContext>(
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
#endif // __HIP_PLATFORM_HCC__
#endif
} else if (math_type == TensorProto_DataType_FLOAT16) {
// Convert alpha, beta from float -> __half
const __half alpha_fp16 = at::Half(alpha);
@ -1223,7 +1193,7 @@ CAFFE2_CUDA_EXPORT void Gemv<at::Half, CUDAContext>(
}
}
#if CUDA_VERSION >= 9000
#ifndef __HIP_PLATFORM_HCC__
// No change, but required. Defer to default CUDA engine
template <>
@ -1473,7 +1443,7 @@ CAFFE2_CUDA_EXPORT void Gemv<at::Half, CUDAContext, TensorCoreEngine>(
trans_A, M, N, alpha, A, x, beta, y, context, math_type);
}
#endif // CUDA_VERSION >= 9000
#endif
template <>
CAFFE2_CUDA_EXPORT void GemmEx<float, CUDAContext>(
@ -1692,7 +1662,7 @@ CAFFE2_CUDA_EXPORT void Dot<at::Half, CUDAContext>(
CUDAContext* context) {
#if defined __HIP_PLATFORM_HCC__ && HIP_VERSION < 210
CAFFE_THROW("HIP currently does not support FP16 completely yet.");
#elif defined __HIP_PLATFORM_HCC__ && HIP_VERSION >= 210
#elif defined __HIP_PLATFORM_HCC__ && HIP_VERSION >= 210
CUBLAS_ENFORCE(cublasSetPointerMode(
context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE));
CUBLAS_ENFORCE(rocblas_hdot(