mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
RIP CUDA <9.2: circleci, aten, and caffe2 (#36846)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/36846 Test Plan: Imported from OSS Differential Revision: D21620850 Pulled By: ngimel fbshipit-source-id: 7ad1676a12f86250f301095ffc6f365a3b370f34
This commit is contained in:
parent
b29e7f9b9d
commit
5e2d8745c8
12 changed files with 20 additions and 118 deletions
|
|
@ -76,14 +76,6 @@ case "$image" in
|
|||
DB=yes
|
||||
VISION=yes
|
||||
;;
|
||||
pytorch-linux-xenial-cuda9-cudnn7-py3)
|
||||
CUDA_VERSION=9.0
|
||||
CUDNN_VERSION=7
|
||||
ANACONDA_PYTHON_VERSION=3.6
|
||||
PROTOBUF=yes
|
||||
DB=yes
|
||||
VISION=yes
|
||||
;;
|
||||
pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4)
|
||||
CUDA_VERSION=9.2
|
||||
CUDNN_VERSION=7
|
||||
|
|
|
|||
|
|
@ -65,11 +65,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
|
|||
# DO NOT install cmake here as it would install a version newer than 3.5, but
|
||||
# we want to pin to version 3.5.
|
||||
conda_install numpy pyyaml mkl mkl-include setuptools cffi typing future six
|
||||
if [[ "$CUDA_VERSION" == 9.0* ]]; then
|
||||
conda_install magma-cuda90 -c pytorch
|
||||
elif [[ "$CUDA_VERSION" == 9.1* ]]; then
|
||||
conda_install magma-cuda91 -c pytorch
|
||||
elif [[ "$CUDA_VERSION" == 9.2* ]]; then
|
||||
if [[ "$CUDA_VERSION" == 9.2* ]]; then
|
||||
conda_install magma-cuda92 -c pytorch
|
||||
elif [[ "$CUDA_VERSION" == 10.0* ]]; then
|
||||
conda_install magma-cuda100 -c pytorch
|
||||
|
|
|
|||
|
|
@ -72,26 +72,6 @@ Tensor prepare_matrix_for_cublas(Tensor& tensor, bool& transpose_tensor) {
|
|||
return tensor_;
|
||||
}
|
||||
|
||||
// Check https://github.com/pytorch/pytorch/issues/22078
|
||||
// for information about the bug. We don't know the exact conditions that trigger it,
|
||||
// but using Sgemm or Hgemm on Maxwell or Pascal seems to be a
|
||||
// necessary condition.
|
||||
static void checkCuda90Bug(int i_m, int i_n, int i_k)
|
||||
{
|
||||
#if CUDA_VERSION < 9200 && CUDA_VERSION >= 9000
|
||||
static std::once_flag alreadyWarned;
|
||||
const int LIMIT = 1 << 21;
|
||||
if (i_m > LIMIT || i_n > LIMIT || i_k > LIMIT) {
|
||||
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
|
||||
if (prop->major == 5 || prop->major == 6) {
|
||||
std::call_once(alreadyWarned, []() {
|
||||
TORCH_WARN("Matrix multiplication for dimensions larger than 2^21 has known bugs on your combination of CUDA version and device type. Please consider upgrading to CUDA 9.2 or later.");
|
||||
});
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) {
|
||||
TORCH_CHECK(
|
||||
(mat1.dim() == 2) && (mat2.dim() == 2) &&
|
||||
|
|
@ -143,9 +123,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
|||
at::ScalarType scalar_type = self.scalar_type();
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] {
|
||||
if (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::Float) {
|
||||
checkCuda90Bug(static_cast<int>(m), static_cast<int>(n), static_cast<int>(k));
|
||||
}
|
||||
scalar_t alpha_val = alpha.to<scalar_t>();
|
||||
scalar_t beta_val = beta.to<scalar_t>();
|
||||
scalar_t* mat1_ptr = mat1_.data_ptr<scalar_t>();
|
||||
|
|
|
|||
|
|
@ -578,9 +578,6 @@ namespace {
|
|||
}
|
||||
|
||||
cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input){
|
||||
#if CUDNN_VERSION < 7200 || CUDA_VERSION < 9010
|
||||
return CUDNN_RNN_ALGO_STANDARD;
|
||||
#else
|
||||
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
|
||||
const int64_t bsize = tensors.mini_batch;
|
||||
//excluding Turing from using persistent rnn.
|
||||
|
|
@ -599,17 +596,12 @@ namespace {
|
|||
}
|
||||
}
|
||||
return CUDNN_RNN_ALGO_STANDARD;
|
||||
#endif
|
||||
}
|
||||
|
||||
cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) {
|
||||
#if CUDNN_VERSION != 7103
|
||||
// CUDNN 7.1.3 enforces RNN descriptor type to be identical to input/weight. This check throws an error for type
|
||||
// promotion. The check has since been removed.
|
||||
if (dtype == CUDNN_DATA_HALF) {
|
||||
return CUDNN_DATA_FLOAT;
|
||||
}
|
||||
#endif
|
||||
return dtype;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -215,15 +215,10 @@ bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern) {
|
|||
}
|
||||
|
||||
bool TensorCoreAvailable() {
|
||||
// requires CUDA 9.0 and above
|
||||
#if CUDA_VERSION < 9000
|
||||
return false;
|
||||
#else
|
||||
int device = CaffeCudaGetDevice();
|
||||
auto& prop = GetDeviceProperty(device);
|
||||
|
||||
return prop.major >= 7;
|
||||
#endif
|
||||
}
|
||||
|
||||
const char* cublasGetErrorString(cublasStatus_t error) {
|
||||
|
|
@ -238,23 +233,18 @@ const char* cublasGetErrorString(cublasStatus_t error) {
|
|||
return "CUBLAS_STATUS_INVALID_VALUE";
|
||||
case CUBLAS_STATUS_ARCH_MISMATCH:
|
||||
return "CUBLAS_STATUS_ARCH_MISMATCH";
|
||||
case CUBLAS_STATUS_INTERNAL_ERROR:
|
||||
return "CUBLAS_STATUS_INTERNAL_ERROR";
|
||||
#ifndef __HIP_PLATFORM_HCC__
|
||||
case CUBLAS_STATUS_MAPPING_ERROR:
|
||||
return "CUBLAS_STATUS_MAPPING_ERROR";
|
||||
case CUBLAS_STATUS_EXECUTION_FAILED:
|
||||
return "CUBLAS_STATUS_EXECUTION_FAILED";
|
||||
#endif
|
||||
case CUBLAS_STATUS_INTERNAL_ERROR:
|
||||
return "CUBLAS_STATUS_INTERNAL_ERROR";
|
||||
#if CUDA_VERSION >= 6000
|
||||
case CUBLAS_STATUS_NOT_SUPPORTED:
|
||||
return "CUBLAS_STATUS_NOT_SUPPORTED";
|
||||
#if CUDA_VERSION >= 6050
|
||||
case CUBLAS_STATUS_LICENSE_ERROR:
|
||||
return "CUBLAS_STATUS_LICENSE_ERROR";
|
||||
#endif // CUDA_VERSION >= 6050
|
||||
#endif // CUDA_VERSION >= 6000
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
#else
|
||||
case rocblas_status_invalid_size:
|
||||
return "rocblas_status_invalid_size";
|
||||
case rocblas_status_perf_degraded:
|
||||
|
|
|
|||
|
|
@ -5,18 +5,14 @@
|
|||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
// Disable strict aliasing errors for CUDA 9.
|
||||
// The cuda_fp16.h header in CUDA 9 RC triggers this diagnostic.
|
||||
// It is included by cusparse.h as well, so guarding the
|
||||
// inclusion of that header here is not enough.
|
||||
#if CUDA_VERSION >= 9000
|
||||
#ifndef __HIP_PLATFORM_HCC__
|
||||
#ifdef __GNUC__
|
||||
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
|
||||
#pragma GCC diagnostic push
|
||||
#endif
|
||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
|
||||
#endif // __GNUC__
|
||||
#endif // CUDA_VERSION >= 9000
|
||||
#endif // __HIP_PLATFORM_HCC__
|
||||
|
||||
#include <cublas_v2.h>
|
||||
#include <curand.h>
|
||||
|
|
@ -42,9 +38,7 @@
|
|||
// CAFFE_HAS_CUDA_FP16 manually.
|
||||
|
||||
#ifndef CAFFE_HAS_CUDA_FP16
|
||||
#if CUDA_VERSION >= 7050 || defined(__HIP_PLATFORM_HCC__)
|
||||
#define CAFFE_HAS_CUDA_FP16
|
||||
#endif // CUDA_VERSION >= 7050
|
||||
#endif // CAFFE_HAS_CUDA_FP16
|
||||
|
||||
#ifdef CAFFE_HAS_CUDA_FP16
|
||||
|
|
@ -59,13 +53,13 @@ constexpr int kFp16CUDADevicePropMajor = 3;
|
|||
#endif
|
||||
|
||||
// Re-enable strict aliasing diagnostic if it was disabled.
|
||||
#if CUDA_VERSION >= 9000
|
||||
#ifndef __HIP_PLATFORM_HCC__
|
||||
#ifdef __GNUC__
|
||||
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
#endif // __GNUC__
|
||||
#endif // CUDA_VERSION >= 9000
|
||||
#endif // __HIP_PLATFORM_HCC__
|
||||
|
||||
/**
|
||||
* The maximum number of peers that each gpu can have when doing p2p setup.
|
||||
|
|
@ -78,12 +72,12 @@ constexpr int kFp16CUDADevicePropMajor = 3;
|
|||
|
||||
namespace caffe2 {
|
||||
|
||||
#if CUDA_VERSION >= 9000
|
||||
#ifndef __HIP_PLATFORM_HCC__
|
||||
/**
|
||||
* Empty class to identify TensorCore-based math
|
||||
*/
|
||||
class TensorCoreEngine {};
|
||||
#endif
|
||||
#endif // __HIP_PLATFORM_HCC__
|
||||
|
||||
#if CUDA_VERSION >= 10000
|
||||
#define CAFFE2_CUDA_PTRATTR_MEMTYPE type
|
||||
|
|
|
|||
|
|
@ -11,7 +11,8 @@ bool BatchMatMulOp<CUDAContext, DefaultEngine>::RunOnDevice() {
|
|||
|
||||
REGISTER_CUDA_OPERATOR(BatchMatMul, BatchMatMulOp<CUDAContext>);
|
||||
|
||||
#if CUDA_VERSION >= 9000
|
||||
|
||||
#ifndef __HIP_PLATFORM_HCC__
|
||||
|
||||
template <>
|
||||
bool BatchMatMulOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
|
||||
|
|
|
|||
|
|
@ -138,7 +138,8 @@ bool FullyConnectedGradientOp<
|
|||
return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_, this);
|
||||
}
|
||||
|
||||
#if CUDA_VERSION >= 9000
|
||||
|
||||
#ifndef __HIP_PLATFORM_HCC__
|
||||
|
||||
// Require these to be defined otherwise TensorCore FC ops will end
|
||||
// up calling the default FC implementation which doesn't have
|
||||
|
|
@ -190,7 +191,8 @@ REGISTER_CUDA_OPERATOR(
|
|||
DefaultEngine,
|
||||
false /* don't transpose weight */>);
|
||||
|
||||
#if CUDA_VERSION >= 9000
|
||||
#ifndef __HIP_PLATFORM_HCC__
|
||||
|
||||
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
|
||||
FC,
|
||||
TENSORCORE,
|
||||
|
|
@ -214,6 +216,7 @@ REGISTER_CUDA_OPERATOR_WITH_ENGINE(
|
|||
CUDAContext,
|
||||
TensorCoreEngine,
|
||||
false /* don't transpose weight */>);
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace caffe2
|
||||
|
|
|
|||
|
|
@ -127,11 +127,7 @@ warpHeap(K k, V v, K& keyHeapHead, K* keyHeap, V* valueHeap) {
|
|||
int index = __popcll(getLaneMaskLt() & vote);
|
||||
int total = __popcll(vote);
|
||||
#else
|
||||
#if CUDA_VERSION >= 9000
|
||||
unsigned int vote = __ballot_sync(__activemask(), wantInsert);
|
||||
#else
|
||||
unsigned int vote = __ballot(wantInsert);
|
||||
#endif
|
||||
|
||||
if (!vote) {
|
||||
// Everything the warp has is smaller than our heap
|
||||
|
|
|
|||
|
|
@ -171,11 +171,7 @@ __device__ void countRadixUsingMask(CountType counts[RadixSize],
|
|||
#if defined(__HIP_PLATFORM_HCC__)
|
||||
counts[j] += __popcll(__ballot(vote));
|
||||
#else
|
||||
#if CUDA_VERSION >= 9000
|
||||
counts[j] += __popc(__ballot_sync(__activemask(), vote));
|
||||
#else
|
||||
counts[j] += __popc(__ballot(vote));
|
||||
#endif
|
||||
#endif // __HIP_PLATFORM_HCC__
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,12 +68,7 @@ __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFuncti
|
|||
T index = __popcll(getLaneMaskLe() & vote);
|
||||
T carry = __popcll(vote);
|
||||
#else
|
||||
#if CUDA_VERSION >= 9000
|
||||
T vote = __ballot_sync(__activemask(), in);
|
||||
#else
|
||||
T vote = __ballot(in);
|
||||
#endif // CUDA_VERSION
|
||||
|
||||
T index = __popc(getLaneMaskLe() & vote);
|
||||
T carry = __popc(vote);
|
||||
#endif // __HIP_PLATFORM_HCC__
|
||||
|
|
|
|||
|
|
@ -865,24 +865,6 @@ CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>(
|
|||
const cublasOperation_t cu_trans_B =
|
||||
(trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
||||
if (math_type == TensorProto_DataType_FLOAT) {
|
||||
#if CUDA_VERSION < 9010
|
||||
// loop over matrices in the batch
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
Gemm<at::Half, CUDAContext>(
|
||||
trans_A,
|
||||
trans_B,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
alpha,
|
||||
A[i],
|
||||
B[i],
|
||||
beta,
|
||||
C[i],
|
||||
context,
|
||||
math_type);
|
||||
}
|
||||
#else
|
||||
thrust::device_vector<const void*> A_device(A, A + batch_size);
|
||||
thrust::device_vector<const void*> B_device(B, B + batch_size);
|
||||
thrust::device_vector<void*> C_device(C, C + batch_size);
|
||||
|
|
@ -909,7 +891,6 @@ CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>(
|
|||
batch_size,
|
||||
CUDA_R_32F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
#endif
|
||||
} else if (math_type == TensorProto_DataType_FLOAT16) {
|
||||
// Convert alpha, beta from float -> __half
|
||||
const __half alpha_fp16 = at::Half(alpha);
|
||||
|
|
@ -989,16 +970,6 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched<at::Half, CUDAContext>(
|
|||
const cublasOperation_t cu_trans_B =
|
||||
(trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
||||
if (math_type == TensorProto_DataType_FLOAT) {
|
||||
#if CUDA_VERSION < 9010 && !defined(__HIP_PLATFORM_HCC__)
|
||||
// loop over matrices in the batch
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
Gemm<at::Half, CUDAContext>(
|
||||
trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context, math_type);
|
||||
A += A_stride;
|
||||
B += B_stride;
|
||||
C += C_stride;
|
||||
}
|
||||
#else
|
||||
CUBLAS_ENFORCE(cublasSetPointerMode(
|
||||
context->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
|
|
@ -1060,7 +1031,6 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched<at::Half, CUDAContext>(
|
|||
CUDA_R_32F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
#endif // __HIP_PLATFORM_HCC__
|
||||
#endif
|
||||
} else if (math_type == TensorProto_DataType_FLOAT16) {
|
||||
// Convert alpha, beta from float -> __half
|
||||
const __half alpha_fp16 = at::Half(alpha);
|
||||
|
|
@ -1223,7 +1193,7 @@ CAFFE2_CUDA_EXPORT void Gemv<at::Half, CUDAContext>(
|
|||
}
|
||||
}
|
||||
|
||||
#if CUDA_VERSION >= 9000
|
||||
#ifndef __HIP_PLATFORM_HCC__
|
||||
|
||||
// No change, but required. Defer to default CUDA engine
|
||||
template <>
|
||||
|
|
@ -1473,7 +1443,7 @@ CAFFE2_CUDA_EXPORT void Gemv<at::Half, CUDAContext, TensorCoreEngine>(
|
|||
trans_A, M, N, alpha, A, x, beta, y, context, math_type);
|
||||
}
|
||||
|
||||
#endif // CUDA_VERSION >= 9000
|
||||
#endif
|
||||
|
||||
template <>
|
||||
CAFFE2_CUDA_EXPORT void GemmEx<float, CUDAContext>(
|
||||
|
|
@ -1692,7 +1662,7 @@ CAFFE2_CUDA_EXPORT void Dot<at::Half, CUDAContext>(
|
|||
CUDAContext* context) {
|
||||
#if defined __HIP_PLATFORM_HCC__ && HIP_VERSION < 210
|
||||
CAFFE_THROW("HIP currently does not support FP16 completely yet.");
|
||||
#elif defined __HIP_PLATFORM_HCC__ && HIP_VERSION >= 210
|
||||
#elif defined __HIP_PLATFORM_HCC__ && HIP_VERSION >= 210
|
||||
CUBLAS_ENFORCE(cublasSetPointerMode(
|
||||
context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE));
|
||||
CUBLAS_ENFORCE(rocblas_hdot(
|
||||
|
|
|
|||
Loading…
Reference in a new issue