Cudnn v6
Summary:
Add cuDNN v6 support, including testing support for dilated convolution.
Also add a check to ensure that the versions of cuDNN used to compile
Caffe2 and to run it are compatible.

Closes https://github.com/caffe2/caffe2/pull/85

Reviewed By: bwasti

Differential Revision: D4387690

Pulled By: Yangqing

fbshipit-source-id: 312960134398dd4afe6ee0c01cdc160046c904e8
parent 1c92e85dae
commit fbf47a8825

9 changed files with 90 additions and 7 deletions

@@ -18,6 +18,9 @@ static_assert(
     CUDNN_VERSION >= 5000,
     "Caffe2 requires cudnn version 5.0 or above.");
 
+#define CUDNN_VERSION_MIN(major, minor, patch) \
+  (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))
+
 namespace caffe2 {
 
 namespace internal {
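The macro encodes a (major, minor, patch) triple the same way cuDNN's headers encode CUDNN_VERSION, so a single integer comparison answers "was this built against at least that cuDNN?". A minimal standalone sketch of the arithmetic; the hard-coded version value is hypothetical:

#include <cstdio>

// Hypothetical stand-in for the value cudnn.h defines; 6021 encodes
// cuDNN 6.0.21 as major*1000 + minor*100 + patch.
#define CUDNN_VERSION 6021

#define CUDNN_VERSION_MIN(major, minor, patch) \
  (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))

int main() {
  printf("at least v6.0.0? %d\n", CUDNN_VERSION_MIN(6, 0, 0)); // 6021 >= 6000 -> 1
  printf("at least v7.0.0? %d\n", CUDNN_VERSION_MIN(7, 0, 0)); // 6021 <  7000 -> 0
  return 0;
}

The encoding relies on minor staying below 10 and patch below 100, which held for cuDNN's release numbering at the time.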

@@ -70,6 +73,25 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
         ::caffe2::internal::cudnnGetErrorString(status)); \
   } while (0)
 
+// Report the version of cuDNN Caffe2 was compiled with.
+inline size_t cudnnCompiledVersion() {
+  return CUDNN_VERSION;
+}
+// Report the run-time version of cuDNN.
+inline size_t cudnnRuntimeVersion() {
+  return cudnnGetVersion();
+}
+
+// Check compatibility of the compiled and run-time cuDNN versions.
+inline void CheckCuDNNVersions() {
+  // The version format is major*1000 + minor*100 + patch.
+  // Major, minor, and patch versions must all match.
+  bool version_match = cudnnCompiledVersion() == cudnnRuntimeVersion();
+  CAFFE_ENFORCE(version_match,
+      "cuDNN compiled (", cudnnCompiledVersion(), ") and "
+      "runtime (", cudnnRuntimeVersion(), ") versions mismatch");
+}
+
 /**
  * cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type
  * in a template function. The class is specialized explicitly for different
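CheckCuDNNVersions() guards against a common deployment failure: the binary was built against one cuDNN but the dynamic loader resolves a different one at run time (for example via LD_LIBRARY_PATH). A self-contained sketch of the same comparison, with a hypothetical stand-in for cudnnGetVersion() and made-up version numbers:

#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Illustrative values only: built against cuDNN 6.0.21, but the loader
// found a 5.1.10 library at run time.
const size_t kCompiledVersion = 6021;
size_t FakeRuntimeVersion() { return 5110; }

int main() {
  if (kCompiledVersion != FakeRuntimeVersion()) {
    // Caffe2 raises this condition through CAFFE_ENFORCE instead.
    fprintf(stderr, "cuDNN compiled (%zu) and runtime (%zu) versions mismatch\n",
            kCompiledVersion, FakeRuntimeVersion());
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}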

@@ -10,6 +10,7 @@
 #endif // CAFFE2_USE_CNMEM
 
 #include "caffe2/core/asan.h"
+#include "caffe2/core/common_cudnn.h"
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/core/init.h"
 #include "caffe2/core/logging.h"

@@ -139,6 +140,9 @@ static void Caffe2InitializeCuda() {
       TypeMeta::Id<Tensor<CUDAContext>>(),
       GetTensorShape<CUDAContext>
   );
+
+  // Check that the versions of cuDNN Caffe2 was compiled and linked with are compatible.
+  CheckCuDNNVersions();
 }
 
 #ifdef CAFFE2_USE_CNMEM

@@ -56,9 +56,12 @@ class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
         pad_l_ == pad_r_,
         "The current padding scheme leads to unequal padding on the left "
         "and right, which is not supported by cudnn.");
+    // dilated convolution supported by some algorithms in cuDNN v6
+#if !(CUDNN_VERSION_MIN(6,0,0))
     OPERATOR_NEEDS_FEATURE(
         dilation_h_ == 1 && dilation_w_ == 1,
         "The cudnn convolution does not support dilation yet.");
+#endif
 
     CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
     CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));

@@ -270,9 +273,15 @@ bool CudnnConvOp<T>::RunOnDevice() {
         H_out,
         W_out));
     // Set the convolution descriptor
+#if CUDNN_VERSION_MIN(6,0,0)
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+        conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, dilation_h_, dilation_w_,
+        CUDNN_CROSS_CORRELATION, cudnnTypeWrapper<T>::type));
+#else
     CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
         conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
         CUDNN_CROSS_CORRELATION));
+#endif
     if (deterministic_) {
       algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
     } else if (exhaustive_search_) {
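The #if/#else pair here adapts to an API change rather than just gating a feature: in cuDNN v6, cudnnSetConvolution2dDescriptor gained a trailing compute-type argument, and its two dilation parameters, which earlier releases required to be 1, are now honored by some algorithms. The same pattern repeats in CudnnConvGradientOp and in the transpose ops below (which keep dilation at 1 and only adapt to the new signature). A self-contained sketch of driving either signature, assuming a cuDNN installation; the padding, stride, and dilation values are arbitrary:

#include <cudnn.h>
#include <cstdio>
#include <cstdlib>

// Minimal error check, standing in for Caffe2's CUDNN_CHECK.
#define CHECK_CUDNN(expr)                                   \
  do {                                                      \
    cudnnStatus_t status = (expr);                          \
    if (status != CUDNN_STATUS_SUCCESS) {                   \
      fprintf(stderr, "%s\n", cudnnGetErrorString(status)); \
      exit(EXIT_FAILURE);                                   \
    }                                                       \
  } while (0)

int main() {
  cudnnConvolutionDescriptor_t conv_desc;
  CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc));
#if CUDNN_VERSION >= 6000
  // v6 signature: pads, strides, dilations, mode, and a compute type.
  CHECK_CUDNN(cudnnSetConvolution2dDescriptor(
      conv_desc,
      /*pad_h=*/1, /*pad_w=*/1,
      /*stride_h=*/1, /*stride_w=*/1,
      /*dilation_h=*/2, /*dilation_w=*/2,
      CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
#else
  // Pre-v6 signature: no compute type, and the dilation slots must be 1.
  CHECK_CUDNN(cudnnSetConvolution2dDescriptor(
      conv_desc, 1, 1, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION));
#endif
  CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(conv_desc));
  printf("descriptor configured\n");
  return 0;
}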

@@ -450,9 +459,15 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
         H_out,
         W_out));
     // Set the convolution descriptor
+#if CUDNN_VERSION_MIN(6,0,0)
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+        conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, dilation_h_, dilation_w_,
+        CUDNN_CROSS_CORRELATION, cudnnTypeWrapper<T>::type));
+#else
     CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
         conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
         CUDNN_CROSS_CORRELATION));
+#endif
     // Set the workspace
 
     size_t bwd_filter_ws_size, bwd_data_ws_size;

@@ -228,6 +228,18 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() {
         pad_r_,
         "The current padding scheme leads to unequal padding on the left "
         "and right, which is not supported by cudnn.");
+#if CUDNN_VERSION_MIN(6,0,0)
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+        conv_desc_,
+        pad_t_,
+        pad_l_,
+        stride_h_,
+        stride_w_,
+        1,
+        1,
+        CUDNN_CROSS_CORRELATION,
+        cudnnTypeWrapper<T>::type));
+#else
     CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
         conv_desc_,
         pad_t_,

@@ -237,6 +249,7 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() {
         1,
         1,
         CUDNN_CROSS_CORRELATION));
+#endif
     if (deterministic_) {
       bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
     } else if (exhaustive_search_) {

@@ -434,6 +447,18 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
         pad_r_,
         "The current padding scheme leads to unequal padding on the left "
         "and right, which is not supported by cudnn.");
+#if CUDNN_VERSION_MIN(6,0,0)
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+        conv_desc_,
+        pad_t_,
+        pad_l_,
+        stride_h_,
+        stride_w_,
+        1,
+        1,
+        CUDNN_CROSS_CORRELATION,
+        cudnnTypeWrapper<T>::type));
+#else
     CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
         conv_desc_,
         pad_t_,

@@ -443,6 +468,7 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
         1,
         1,
         CUDNN_CROSS_CORRELATION));
+#endif
     // Set the workspace
 
     size_t bwd_filter_ws_size, fwd_ws_size;

@@ -301,6 +301,11 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
       Output(GRAD_WEIGHT)->template mutable_data<T>(),
       &context_);
 
+#if CUDNN_VERSION_MIN(6,0,0)
+  auto * reserve = Output(RNN_SCRATCH_OUT)->template mutable_data<T>();
+#else
+  const auto * reserve = Output(RNN_SCRATCH_OUT)->template data<T>();
+#endif
   cudnn_wrapper_.with_cudnn_state(0, [&](CuDNNState* state) {
     CUDNN_CHECK(cudnnRNNBackwardData(
         state->cudnn_handle(),
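The const-ness split above exists because cuDNN v6 changed cudnnRNNBackwardData to take the RNN reserve space as a mutable pointer (the backward pass may update it in place), where pre-v6 it was read-only; this is also why the gradient op gains RNN_SCRATCH_OUT as an output (see the schema and gradient-maker hunks below). A minimal sketch of the adaptation pattern, with hypothetical stand-ins for the cuDNN calls; without cudnn.h on the include path, the pre-v6 branch is compiled:

#include <cstdio>
#include <vector>

// Hypothetical stand-ins for cudnnRNNBackwardData's reserve-space argument:
// mutable under v6, read-only before it. For illustration only.
void BackwardV6(void* reserve) { static_cast<float*>(reserve)[0] = 1.0f; }
void BackwardV5(const void* reserve) { (void)reserve; }

int main() {
  std::vector<float> scratch(16, 0.0f);
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 6000
  void* reserve = scratch.data();        // v6: the call may write to it
  BackwardV6(reserve);
#else
  const void* reserve = scratch.data();  // pre-v6: a const view suffices
  BackwardV5(reserve);
#endif
  printf("scratch[0] = %f\n", scratch[0]);
  return 0;
}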

@@ -328,7 +333,7 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
         Output(GRAD_CELL_INPUT)->template mutable_data<T>(),
         state->workspace().get(cudnnWsNbytes_),
         cudnnWsNbytes_,
-        Input(RNN_SCRATCH).template data<T>(),
+        reserve,
         reserveNbytes_));
     CUDNN_CHECK(cudnnRNNBackwardWeights(
         state->cudnn_handle(),

@@ -344,7 +349,7 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
         cudnnWsNbytes_,
         wDesc_,
         Output(GRAD_WEIGHT)->template mutable_data<T>(),
-        Input(RNN_SCRATCH).template data<T>(),
+        reserve,
         reserveNbytes_));
   });
   return true;

@@ -426,7 +431,7 @@ input_mode) are passed directly through to CuDNN.
 
 )DOC");
 REGISTER_CUDNN_OPERATOR(RecurrentGradient, RecurrentGradientOp<float>);
-OPERATOR_SCHEMA(RecurrentGradient).NumInputs(9).NumOutputs(5);
+OPERATOR_SCHEMA(RecurrentGradient).NumInputs(9).NumOutputs(6);
 REGISTER_CUDNN_OPERATOR(RecurrentInit, RecurrentInitOp<float>);
 OPERATOR_SCHEMA(RecurrentInit).NumInputs(1).NumOutputs(2);
 

@@ -450,7 +455,8 @@ struct GetRecurrentGradient : public GradientMakerBase {
         GI(1), // GRAD_HIDDEN_INPUT
         GI(2), // GRAD_CELL_INPUT
         GI(3), // GRAD_WEIGHT
-        O(4) // DROPOUT_STATES
+        O(4), // DROPOUT_STATES
+        O(3) // RNN_SCRATCH
         });
   }
 };

@@ -119,7 +119,8 @@ class RecurrentGradientOp : public RecurrentBaseOp<T> {
       GRAD_HIDDEN_INPUT,
       GRAD_CELL_INPUT,
       GRAD_WEIGHT,
-      DROPOUT_STATES);
+      DROPOUT_STATES,
+      RNN_SCRATCH_OUT);
 };
 
 template <typename T>

@@ -153,7 +153,9 @@ class TestConvolution(hu.HypothesisTestCase):
                            order, engine, use_bias, gc, dc):
         dkernel = dilation * (kernel - 1) + 1
 
-        assume("" == engine or 1 == dilation)
+        # cuDNN v6+ supports dilated convolutions
+        if workspace.GetCuDNNVersion() < 6000:
+            assume("" == engine or 1 == dilation)
         assume(engine != "MKLDNN" or use_bias is True)
 
         op = core.CreateOperator(

@@ -211,8 +213,12 @@ class TestConvolution(hu.HypothesisTestCase):
         b = np.random.rand(output_channels).astype(np.float32) - 0.5
         Output = collections.namedtuple("Output", ["Y", "engine", "order"])
         outputs = []
+
+        # cuDNN v6+ supports dilated convolutions
+        engine_list = ["", "CUDNN"] if (dilation == 1 or workspace.GetCuDNNVersion() >= 6000) else [""]
+
         for order in ["NCHW", "NHWC"]:
-            for engine in (["", "CUDNN"] if dilation == 1 else [""]):
+            for engine in engine_list:
                 op = core.CreateOperator(
                     "Conv",
                     ["X", "w", "b"] if use_bias else ["X", "w"],

@@ -10,6 +10,7 @@
 #include <pybind11/stl.h>
 
 #include "caffe2/core/context_gpu.h"
+#include "caffe2/core/common_cudnn.h"
 #include "caffe2/operators/operator_fallback_gpu.h"
 
 namespace caffe2 {

@@ -27,6 +28,7 @@ void addCUDAGlobalMethods(py::module& m) {
   m.def("num_cuda_devices", &NumCudaDevices);
   m.def("set_default_gpu_id", &SetDefaultGPUID);
   m.def("get_default_gpu_id", &GetDefaultGPUID);
+  m.def("get_cudnn_version", &cudnnCompiledVersion);
   m.def("get_cuda_peer_access_pattern", []() {
     std::vector<std::vector<bool>> pattern;
     CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern));
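A note on the binding layer: m.def exposes a C++ function to Python under the given name, so this change lets Python query the cuDNN version Caffe2 was compiled against (which, once CheckCuDNNVersions has passed, also equals the run-time version). A minimal pybind11 sketch of the same idea; the module and function names here are hypothetical, not Caffe2's:

#include <cstddef>
#include <pybind11/pybind11.h>

// Hypothetical stand-in for cudnnCompiledVersion(); the value is made up.
size_t compiled_version() { return 6021; }

PYBIND11_MODULE(version_demo, m) {
  // After `import version_demo`, Python receives 6021 from
  // version_demo.get_cudnn_version().
  m.def("get_cudnn_version", &compiled_version);
}

Compiled as a shared module, this is used from Python exactly like the GetCuDNNVersion wrapper added to workspace.py in the last hunk.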

@@ -34,6 +34,7 @@ if has_gpu_support:
     NumCudaDevices = C.num_cuda_devices
     SetDefaultGPUID = C.set_default_gpu_id
     GetDefaultGPUID = C.get_default_gpu_id
+    GetCuDNNVersion = C.get_cudnn_version
 
     def GetCudaPeerAccessPattern():
         return np.asarray(C.get_cuda_peer_access_pattern())