Summary:
Add cuDNN v6 support, including test coverage for dilated convolution.
Add a check to ensure that the cuDNN versions used to compile Caffe2 and to run it are compatible.
Closes https://github.com/caffe2/caffe2/pull/85

Reviewed By: bwasti

Differential Revision: D4387690

Pulled By: Yangqing

fbshipit-source-id: 312960134398dd4afe6ee0c01cdc160046c904e8
This commit is contained in:
Simon Layton 2017-02-28 17:38:44 -08:00 committed by Facebook Github Bot
parent 1c92e85dae
commit fbf47a8825
9 changed files with 90 additions and 7 deletions

View file

@ -18,6 +18,9 @@ static_assert(
CUDNN_VERSION >= 5000,
"Caffe2 requires cudnn version 5.0 or above.");
#define CUDNN_VERSION_MIN(major, minor, patch) \
(CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))
namespace caffe2 {
namespace internal {
@ -70,6 +73,25 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
::caffe2::internal::cudnnGetErrorString(status)); \
} while (0)
// Report the version of cuDNN that Caffe2 was compiled with.
inline size_t cudnnCompiledVersion() {
  return CUDNN_VERSION;
}
// Report the version of cuDNN that Caffe2 is running with.
inline size_t cudnnRuntimeVersion() {
  return cudnnGetVersion();
}
// Check that the compiled and runtime cuDNN versions are compatible.
inline void CheckCuDNNVersions() {
  // The version format is major*1000 + minor*100 + patch, e.g. 5110 for
  // cuDNN 5.1.10. Major, minor and patch versions must all match.
  bool version_match = cudnnCompiledVersion() == cudnnRuntimeVersion();
  CAFFE_ENFORCE(version_match,
                "cuDNN compiled (", cudnnCompiledVersion(), ") and ",
                "runtime (", cudnnRuntimeVersion(), ") versions mismatch");
}
/**
* cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type
* in a template function. The class is specialized explicitly for different

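For illustration, here is a minimal standalone sketch of the version encoding that CUDNN_VERSION_MIN and CheckCuDNNVersions rely on. The CUDNN_VERSION value below is a hypothetical stand-in for the one cudnn.h provides:

    #include <iostream>

    // Hypothetical compile-time version: the encoding is
    // major*1000 + minor*100 + patch, so cuDNN 6.0.21 -> 6021.
    #define CUDNN_VERSION 6021

    // Same expansion as the macro added to common_cudnn.h above.
    #define CUDNN_VERSION_MIN(major, minor, patch) \
      (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))

    int main() {
      std::cout << CUDNN_VERSION_MIN(6, 0, 0) << "\n";  // 1: 6021 >= 6000
      std::cout << CUDNN_VERSION_MIN(7, 0, 0) << "\n";  // 0: 6021 <  7000
      return 0;
    }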
View file

@ -10,6 +10,7 @@
#endif // CAFFE2_USE_CNMEM
#include "caffe2/core/asan.h"
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
@ -139,6 +140,9 @@ static void Caffe2InitializeCuda() {
TypeMeta::Id<Tensor<CUDAContext>>(),
GetTensorShape<CUDAContext>
);
// Check that the compiled and runtime cuDNN versions are compatible
CheckCuDNNVersions();
}
#ifdef CAFFE2_USE_CNMEM

View file

@ -56,9 +56,12 @@ class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
pad_l_ == pad_r_,
"The current padding scheme leads to unequal padding on the left "
"and right, which is not supported by cudnn.");
// Dilated convolution is supported by some algorithms in cuDNN v6
#if !(CUDNN_VERSION_MIN(6,0,0))
OPERATOR_NEEDS_FEATURE(
dilation_h_ == 1 && dilation_w_ == 1,
"The cudnn convolution does not support dilation yet.");
#endif
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
@ -270,9 +273,15 @@ bool CudnnConvOp<T>::RunOnDevice() {
H_out,
W_out));
// Set the convolution descriptor
#if CUDNN_VERSION_MIN(6,0,0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, dilation_h_, dilation_w_,
CUDNN_CROSS_CORRELATION, cudnnTypeWrapper<T>::type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
CUDNN_CROSS_CORRELATION));
#endif
if (deterministic_) {
algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else if (exhaustive_search_) {
@ -450,9 +459,15 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
H_out,
W_out));
// Set the convolution descriptor
#if CUDNN_VERSION_MIN(6,0,0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, dilation_h_, dilation_w_,
CUDNN_CROSS_CORRELATION, cudnnTypeWrapper<T>::type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
CUDNN_CROSS_CORRELATION));
#endif
// Set the workspace
size_t bwd_filter_ws_size, bwd_data_ws_size;

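As background for the dilation guards above, a small self-contained sketch of the arithmetic involved, using the same dkernel = dilation * (kernel - 1) + 1 formula as the conv test further down; all sizes here are illustrative:

    #include <iostream>

    int main() {
      const int kernel = 3, dilation = 2;       // illustrative filter settings
      const int size = 7, pad = 0, stride = 1;  // illustrative input settings
      // A dilated filter samples every dilation-th element, which enlarges
      // its effective extent without adding weights.
      const int dkernel = dilation * (kernel - 1) + 1;               // 5
      const int out_size = (size + 2 * pad - dkernel) / stride + 1;  // 3
      std::cout << "effective kernel " << dkernel
                << ", output " << out_size << std::endl;
      return 0;
    }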
View file

@ -228,6 +228,18 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() {
pad_r_,
"The current padding scheme leads to unequal padding on the left "
"and right, which is not supported by cudnn.");
#if CUDNN_VERSION_MIN(6,0,0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
pad_l_,
stride_h_,
stride_w_,
1,
1,
CUDNN_CROSS_CORRELATION,
cudnnTypeWrapper<T>::type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
@ -237,6 +249,7 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() {
1,
1,
CUDNN_CROSS_CORRELATION));
#endif
if (deterministic_) {
bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} else if (exhaustive_search_) {
@ -434,6 +447,18 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
pad_r_,
"The current padding scheme leads to unequal padding on the left "
"and right, which is not supported by cudnn.");
#if CUDNN_VERSION_MIN(6,0,0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
pad_l_,
stride_h_,
stride_w_,
1,
1,
CUDNN_CROSS_CORRELATION,
cudnnTypeWrapper<T>::type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
@ -443,6 +468,7 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
1,
1,
CUDNN_CROSS_CORRELATION));
#endif
// Set the workspace
size_t bwd_filter_ws_size, fwd_ws_size;

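The #if/#else blocks in the two convolution files exist because cuDNN v6 changed the signature of cudnnSetConvolution2dDescriptor. Roughly, paraphrasing the cuDNN headers (parameter names approximate):

    // cuDNN v5.x: the two arguments after the strides are upscale factors
    // and must both be 1; there is no compute-type argument.
    //   cudnnSetConvolution2dDescriptor(desc, pad_h, pad_w, stride_h,
    //                                   stride_w, 1, 1, mode);
    //
    // cuDNN v6.x: the same slots become real dilation factors, and a
    // trailing cudnnDataType_t compute type is required.
    //   cudnnSetConvolution2dDescriptor(desc, pad_h, pad_w, stride_h,
    //                                   stride_w, dilation_h, dilation_w,
    //                                   mode, compute_type);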
View file

@ -301,6 +301,11 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
Output(GRAD_WEIGHT)->template mutable_data<T>(),
&context_);
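// cuDNN v6 takes the RNN reserve space as a mutable pointer in the backward
// calls (it may be written during the backward pass), which is why the
// scratch blob is re-exposed as an output (RNN_SCRATCH_OUT) below and
// fetched via mutable_data; under v5 a const pointer still suffices.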
#if CUDNN_VERSION_MIN(6,0,0)
auto * reserve = Output(RNN_SCRATCH_OUT)->template mutable_data<T>();
#else
const auto * reserve = Output(RNN_SCRATCH_OUT)->template data<T>();
#endif
cudnn_wrapper_.with_cudnn_state(0, [&](CuDNNState* state) {
CUDNN_CHECK(cudnnRNNBackwardData(
state->cudnn_handle(),
@ -328,7 +333,7 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
Output(GRAD_CELL_INPUT)->template mutable_data<T>(),
state->workspace().get(cudnnWsNbytes_),
cudnnWsNbytes_,
Input(RNN_SCRATCH).template data<T>(),
reserve,
reserveNbytes_));
CUDNN_CHECK(cudnnRNNBackwardWeights(
state->cudnn_handle(),
@ -344,7 +349,7 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
cudnnWsNbytes_,
wDesc_,
Output(GRAD_WEIGHT)->template mutable_data<T>(),
Input(RNN_SCRATCH).template data<T>(),
reserve,
reserveNbytes_));
});
return true;
@ -426,7 +431,7 @@ input_mode) are passed directly through to CuDNN.
)DOC");
REGISTER_CUDNN_OPERATOR(RecurrentGradient, RecurrentGradientOp<float>);
OPERATOR_SCHEMA(RecurrentGradient).NumInputs(9).NumOutputs(5);
OPERATOR_SCHEMA(RecurrentGradient).NumInputs(9).NumOutputs(6);
REGISTER_CUDNN_OPERATOR(RecurrentInit, RecurrentInitOp<float>);
OPERATOR_SCHEMA(RecurrentInit).NumInputs(1).NumOutputs(2);
@ -450,7 +455,8 @@ struct GetRecurrentGradient : public GradientMakerBase {
GI(1), // GRAD_HIDDEN_INPUT
GI(2), // GRAD_CELL_INPUT
GI(3), // GRAD_WEIGHT
O(4) // DROPOUT_STATES
O(4), // DROPOUT_STATES
O(3) // RNN_SCRATCH
});
}
};

View file

@ -119,7 +119,8 @@ class RecurrentGradientOp : public RecurrentBaseOp<T> {
GRAD_HIDDEN_INPUT,
GRAD_CELL_INPUT,
GRAD_WEIGHT,
DROPOUT_STATES);
DROPOUT_STATES,
RNN_SCRATCH_OUT);
};
template <typename T>

View file

@ -153,7 +153,9 @@ class TestConvolution(hu.HypothesisTestCase):
order, engine, use_bias, gc, dc):
dkernel = dilation * (kernel - 1) + 1
assume("" == engine or 1 == dilation)
# cuDNN v6+ supports dilated convolutions
if workspace.GetCuDNNVersion() < 6000:
    assume("" == engine or 1 == dilation)
assume(engine != "MKLDNN" or use_bias is True)
op = core.CreateOperator(
@ -211,8 +213,12 @@ class TestConvolution(hu.HypothesisTestCase):
b = np.random.rand(output_channels).astype(np.float32) - 0.5
Output = collections.namedtuple("Output", ["Y", "engine", "order"])
outputs = []
# cuDNN v6+ supports dilated convolutions
engine_list = ["", "CUDNN"] if ((dilation == 1) or (workspace.GetCuDNNVersion() >= 6000)) else [""]
for order in ["NCHW", "NHWC"]:
for engine in (["", "CUDNN"] if dilation == 1 else [""]):
for engine in engine_list:
op = core.CreateOperator(
"Conv",
["X", "w", "b"] if use_bias else ["X", "w"],

View file

@ -10,6 +10,7 @@
#include <pybind11/stl.h>
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/common_cudnn.h"
#include "caffe2/operators/operator_fallback_gpu.h"
namespace caffe2 {
@ -27,6 +28,7 @@ void addCUDAGlobalMethods(py::module& m) {
m.def("num_cuda_devices", &NumCudaDevices);
m.def("set_default_gpu_id", &SetDefaultGPUID);
m.def("get_default_gpu_id", &GetDefaultGPUID);
m.def("get_cudnn_version", &cudnnCompiledVersion);
m.def("get_cuda_peer_access_pattern", []() {
std::vector<std::vector<bool>> pattern;
CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern));

View file

@ -34,6 +34,7 @@ if has_gpu_support:
NumCudaDevices = C.num_cuda_devices
SetDefaultGPUID = C.set_default_gpu_id
GetDefaultGPUID = C.get_default_gpu_id
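# Compile-time cuDNN version, encoded as major * 1000 + minor * 100 + patch;
# the dilated-convolution tests compare it against 6000 (cuDNN v6).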
GetCuDNNVersion = C.get_cudnn_version
def GetCudaPeerAccessPattern():
return np.asarray(C.get_cuda_peer_access_pattern())