From fbf47a88252a25c93e6658dfd96ade6009e4052f Mon Sep 17 00:00:00 2001
From: Simon Layton
Date: Tue, 28 Feb 2017 17:38:44 -0800
Subject: [PATCH] Cudnn v6

Summary:
Add cudnn v6 support, including testing support for dilated convolution.
Add a check to ensure that the versions of cuDNN used to compile Caffe2 and
run it are compatible.
Closes https://github.com/caffe2/caffe2/pull/85

Reviewed By: bwasti

Differential Revision: D4387690

Pulled By: Yangqing

fbshipit-source-id: 312960134398dd4afe6ee0c01cdc160046c904e8
---
 caffe2/core/common_cudnn.h                  | 22 +++++++++++++++++
 caffe2/core/context_gpu.cu                  |  4 ++++
 caffe2/operators/conv_op_cudnn.cc           | 15 ++++++++++++
 caffe2/operators/conv_transpose_op_cudnn.cc | 26 +++++++++++++++++++++
 caffe2/operators/recurrent_op_cudnn.cc      | 14 +++++++----
 caffe2/operators/recurrent_op_cudnn.h       |  3 ++-
 caffe2/python/operator_test/conv_test.py    | 10 ++++++--
 caffe2/python/pybind_state_gpu.cc           |  2 ++
 caffe2/python/workspace.py                  |  1 +
 9 files changed, 90 insertions(+), 7 deletions(-)

diff --git a/caffe2/core/common_cudnn.h b/caffe2/core/common_cudnn.h
index 6337b1210d5..20db8c8c119 100644
--- a/caffe2/core/common_cudnn.h
+++ b/caffe2/core/common_cudnn.h
@@ -18,6 +18,9 @@ static_assert(
     CUDNN_VERSION >= 5000,
     "Caffe2 requires cudnn version 5.0 or above.");
 
+#define CUDNN_VERSION_MIN(major, minor, patch) \
+  (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))
+
 namespace caffe2 {
 
 namespace internal {
@@ -70,6 +73,25 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
         ::caffe2::internal::cudnnGetErrorString(status)); \
   } while (0)
 
+// report the version of cuDNN Caffe2 was compiled with
+inline size_t cudnnCompiledVersion() {
+  return CUDNN_VERSION;
+}
+// report the runtime version of cuDNN
+inline size_t cudnnRuntimeVersion() {
+  return cudnnGetVersion();
+}
+
+// Check compatibility of compiled and runtime cuDNN versions
+inline void CheckCuDNNVersions() {
+  // Version format is major*1000 + minor*100 + patch
+  // Major, minor and patch versions must all match
+  bool version_match = cudnnCompiledVersion() == cudnnRuntimeVersion();
+  CAFFE_ENFORCE(version_match,
+                "cuDNN compiled (", cudnnCompiledVersion(), ") and "
+                "runtime (", cudnnRuntimeVersion(), ") versions mismatch");
+}
+
 /**
  * cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type
  * in a template function. The class is specialized explicitly for different
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
index 3cae5c4757a..8f3273bb9e7 100644
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -10,6 +10,7 @@
 #endif // CAFFE2_USE_CNMEM
 
 #include "caffe2/core/asan.h"
+#include "caffe2/core/common_cudnn.h"
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/core/init.h"
 #include "caffe2/core/logging.h"
@@ -139,6 +140,9 @@ static void Caffe2InitializeCuda() {
       TypeMeta::Id<Tensor<CUDAContext>>(),
       GetTensorShape<CUDAContext>
   );
+
+  // Check the versions of cuDNN that were compiled and linked with are compatible
+  CheckCuDNNVersions();
 }
 
 #ifdef CAFFE2_USE_CNMEM
diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc
index 6867916356d..bd1e0d6165f 100644
--- a/caffe2/operators/conv_op_cudnn.cc
+++ b/caffe2/operators/conv_op_cudnn.cc
@@ -56,9 +56,12 @@ class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
         pad_l_ == pad_r_,
         "The current padding scheme leads to unequal padding on the left "
         "and right, which is not supported by cudnn.");
+    // dilated convolution supported by some algorithms in cuDNN v6
+#if !(CUDNN_VERSION_MIN(6,0,0))
     OPERATOR_NEEDS_FEATURE(
         dilation_h_ == 1 && dilation_w_ == 1,
         "The cudnn convolution does not support dilation yet.");
+#endif
 
     CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
     CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
@@ -270,9 +273,15 @@ bool CudnnConvOp<T>::RunOnDevice() {
       H_out,
       W_out));
   // Set the convolution descriptor
+#if CUDNN_VERSION_MIN(6,0,0)
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+      conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, dilation_h_, dilation_w_,
+      CUDNN_CROSS_CORRELATION, cudnnTypeWrapper<T>::type));
+#else
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
       conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
       CUDNN_CROSS_CORRELATION));
+#endif
   if (deterministic_) {
     algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
   } else if (exhaustive_search_) {
@@ -450,9 +459,15 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
       H_out,
       W_out));
   // Set the convolution descriptor
+#if CUDNN_VERSION_MIN(6,0,0)
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+      conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, dilation_h_, dilation_w_,
+      CUDNN_CROSS_CORRELATION, cudnnTypeWrapper<T>::type));
+#else
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
       conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
       CUDNN_CROSS_CORRELATION));
+#endif
 
   // Set the workspace
   size_t bwd_filter_ws_size, bwd_data_ws_size;
diff --git a/caffe2/operators/conv_transpose_op_cudnn.cc b/caffe2/operators/conv_transpose_op_cudnn.cc
index f1fc5386b93..bd07bbb0fe0 100644
--- a/caffe2/operators/conv_transpose_op_cudnn.cc
+++ b/caffe2/operators/conv_transpose_op_cudnn.cc
@@ -228,6 +228,18 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() {
       pad_r_,
       "The current padding scheme leads to unequal padding on the left "
       "and right, which is not supported by cudnn.");
+#if CUDNN_VERSION_MIN(6,0,0)
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+      conv_desc_,
+      pad_t_,
+      pad_l_,
+      stride_h_,
+      stride_w_,
+      1,
+      1,
+      CUDNN_CROSS_CORRELATION,
+      cudnnTypeWrapper<T>::type));
+#else
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
       conv_desc_,
       pad_t_,
@@ -237,6 +249,7 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() {
       1,
       1,
       CUDNN_CROSS_CORRELATION));
+#endif
   if (deterministic_) {
     bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
   } else if (exhaustive_search_) {
@@ -434,6 +447,18 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
       pad_r_,
       "The current padding scheme leads to unequal padding on the left "
       "and right, which is not supported by cudnn.");
+#if CUDNN_VERSION_MIN(6,0,0)
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+      conv_desc_,
+      pad_t_,
+      pad_l_,
+      stride_h_,
+      stride_w_,
+      1,
+      1,
+      CUDNN_CROSS_CORRELATION,
+      cudnnTypeWrapper<T>::type));
+#else
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
       conv_desc_,
       pad_t_,
@@ -443,6 +468,7 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
       1,
       1,
       CUDNN_CROSS_CORRELATION));
+#endif
 
   // Set the workspace
   size_t bwd_filter_ws_size, fwd_ws_size;
diff --git a/caffe2/operators/recurrent_op_cudnn.cc b/caffe2/operators/recurrent_op_cudnn.cc
index 5c28aecde8b..332cd17645d 100644
--- a/caffe2/operators/recurrent_op_cudnn.cc
+++ b/caffe2/operators/recurrent_op_cudnn.cc
@@ -301,6 +301,11 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
       Output(GRAD_WEIGHT)->template mutable_data<T>(),
       &context_);
 
+#if CUDNN_VERSION_MIN(6,0,0)
+  auto * reserve = Output(RNN_SCRATCH_OUT)->template mutable_data<T>();
+#else
+  const auto * reserve = Output(RNN_SCRATCH_OUT)->template data<T>();
+#endif
   cudnn_wrapper_.with_cudnn_state(0, [&](CuDNNState* state) {
     CUDNN_CHECK(cudnnRNNBackwardData(
         state->cudnn_handle(),
@@ -328,7 +333,7 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
         Output(GRAD_CELL_INPUT)->template mutable_data<T>(),
         state->workspace().get(cudnnWsNbytes_),
         cudnnWsNbytes_,
-        Input(RNN_SCRATCH).template data<T>(),
+        reserve,
         reserveNbytes_));
     CUDNN_CHECK(cudnnRNNBackwardWeights(
         state->cudnn_handle(),
@@ -344,7 +349,7 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
         cudnnWsNbytes_,
         wDesc_,
         Output(GRAD_WEIGHT)->template mutable_data<T>(),
-        Input(RNN_SCRATCH).template data<T>(),
+        reserve,
         reserveNbytes_));
   });
   return true;
@@ -426,7 +431,7 @@ input_mode) are passed directly through to CuDNN.
 )DOC");
 
 REGISTER_CUDNN_OPERATOR(RecurrentGradient, RecurrentGradientOp<float>);
-OPERATOR_SCHEMA(RecurrentGradient).NumInputs(9).NumOutputs(5);
+OPERATOR_SCHEMA(RecurrentGradient).NumInputs(9).NumOutputs(6);
 
 REGISTER_CUDNN_OPERATOR(RecurrentInit, RecurrentInitOp<float>);
 OPERATOR_SCHEMA(RecurrentInit).NumInputs(1).NumOutputs(2);
@@ -450,7 +455,8 @@ struct GetRecurrentGradient : public GradientMakerBase {
         GI(1), // GRAD_HIDDEN_INPUT
         GI(2), // GRAD_CELL_INPUT
         GI(3), // GRAD_WEIGHT
-        O(4) // DROPOUT_STATES
+        O(4), // DROPOUT_STATES
+        O(3) // RNN_SCRATCH
     });
   }
 };
diff --git a/caffe2/operators/recurrent_op_cudnn.h b/caffe2/operators/recurrent_op_cudnn.h
index c75b5ae6808..727320fd3ea 100644
--- a/caffe2/operators/recurrent_op_cudnn.h
+++ b/caffe2/operators/recurrent_op_cudnn.h
@@ -119,7 +119,8 @@ class RecurrentGradientOp : public RecurrentBaseOp<T> {
       GRAD_HIDDEN_INPUT,
       GRAD_CELL_INPUT,
       GRAD_WEIGHT,
-      DROPOUT_STATES);
+      DROPOUT_STATES,
+      RNN_SCRATCH_OUT);
 };
 
 template <typename T>
diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py
index ebedebd0b88..730ab655b2f 100644
--- a/caffe2/python/operator_test/conv_test.py
+++ b/caffe2/python/operator_test/conv_test.py
@@ -153,7 +153,9 @@ class TestConvolution(hu.HypothesisTestCase):
                                order, engine, use_bias, gc, dc):
         dkernel = dilation * (kernel - 1) + 1
-        assume("" == engine or 1 == dilation)
+        # cuDNN v6+ supports dilated convolutions
+        if (workspace.GetCuDNNVersion() < 6000):
+            assume("" == engine or 1 == dilation)
         assume(engine != "MKLDNN" or use_bias is True)
 
         op = core.CreateOperator(
@@ -211,8 +213,12 @@ class TestConvolution(hu.HypothesisTestCase):
         b = np.random.rand(output_channels).astype(np.float32) - 0.5
         Output = collections.namedtuple("Output", ["Y", "engine", "order"])
         outputs = []
+
+        # cuDNN v6+ supports dilated convolutions
+        engine_list = ["", "CUDNN"] if ((dilation == 1) or (workspace.GetCuDNNVersion() >= 6000)) else [""]
+
         for order in ["NCHW", "NHWC"]:
-            for engine in (["", "CUDNN"] if dilation == 1 else [""]):
+            for engine in engine_list:
                 op = core.CreateOperator(
                     "Conv",
                     ["X", "w", "b"] if use_bias else ["X", "w"],
diff --git a/caffe2/python/pybind_state_gpu.cc b/caffe2/python/pybind_state_gpu.cc
index ac99c4e3332..3a3e9043a69 100644
--- a/caffe2/python/pybind_state_gpu.cc
+++ b/caffe2/python/pybind_state_gpu.cc
@@ -10,6 +10,7 @@
 #include <pybind11/pybind11.h>
 
 #include "caffe2/core/context_gpu.h"
+#include "caffe2/core/common_cudnn.h"
 #include "caffe2/operators/operator_fallback_gpu.h"
 
 namespace caffe2 {
@@ -27,6 +28,7 @@ void addCUDAGlobalMethods(py::module& m) {
   m.def("num_cuda_devices", &NumCudaDevices);
   m.def("set_default_gpu_id", &SetDefaultGPUID);
   m.def("get_default_gpu_id", &GetDefaultGPUID);
+  m.def("get_cudnn_version", &cudnnCompiledVersion);
   m.def("get_cuda_peer_access_pattern", []() {
     std::vector<std::vector<bool>> pattern;
     CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern));
diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py
index c2232386321..e790b95805c 100644
--- a/caffe2/python/workspace.py
+++ b/caffe2/python/workspace.py
@@ -34,6 +34,7 @@ if has_gpu_support:
     NumCudaDevices = C.num_cuda_devices
     SetDefaultGPUID = C.set_default_gpu_id
     GetDefaultGPUID = C.get_default_gpu_id
+    GetCuDNNVersion = C.get_cudnn_version
 
     def GetCudaPeerAccessPattern():
         return np.asarray(C.get_cuda_peer_access_pattern())