From fbf47a88252a25c93e6658dfd96ade6009e4052f Mon Sep 17 00:00:00 2001
From: Simon Layton
Date: Tue, 28 Feb 2017 17:38:44 -0800
Subject: [PATCH] Cudnn v6

Summary:
Add cudnn v6 support, including testing support for dilated convolution.
Add a check to ensure that the versions of cuDNN used to compile Caffe2 and
run it are compatible.
Closes https://github.com/caffe2/caffe2/pull/85

Reviewed By: bwasti

Differential Revision: D4387690

Pulled By: Yangqing

fbshipit-source-id: 312960134398dd4afe6ee0c01cdc160046c904e8
---
 caffe2/core/common_cudnn.h                  | 22 +++++++++++++++++
 caffe2/core/context_gpu.cu                  |  4 ++++
 caffe2/operators/conv_op_cudnn.cc           | 15 ++++++++++++
 caffe2/operators/conv_transpose_op_cudnn.cc | 26 +++++++++++++++++++++
 caffe2/operators/recurrent_op_cudnn.cc      | 14 +++++++----
 caffe2/operators/recurrent_op_cudnn.h       |  3 ++-
 caffe2/python/operator_test/conv_test.py    | 10 ++++++--
 caffe2/python/pybind_state_gpu.cc           |  2 ++
 caffe2/python/workspace.py                  |  1 +
 9 files changed, 90 insertions(+), 7 deletions(-)

diff --git a/caffe2/core/common_cudnn.h b/caffe2/core/common_cudnn.h
index 6337b1210d5..20db8c8c119 100644
--- a/caffe2/core/common_cudnn.h
+++ b/caffe2/core/common_cudnn.h
@@ -18,6 +18,9 @@ static_assert(
     CUDNN_VERSION >= 5000,
     "Caffe2 requires cudnn version 5.0 or above.");
 
+#define CUDNN_VERSION_MIN(major, minor, patch) \
+  (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))
+
 namespace caffe2 {
 
 namespace internal {
@@ -70,6 +73,25 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
         ::caffe2::internal::cudnnGetErrorString(status)); \
   } while (0)
 
+// report the version of cuDNN Caffe2 was compiled with
+inline size_t cudnnCompiledVersion() {
+  return CUDNN_VERSION;
+}
+// report the runtime version of cuDNN
+inline size_t cudnnRuntimeVersion() {
+  return cudnnGetVersion();
+}
+
+// Check compatibility of compiled and runtime cuDNN versions
+inline void CheckCuDNNVersions() {
+  // Version format is major*1000 + minor*100 + patch
+  // Major, minor and patch versions must all match
+  bool version_match = cudnnCompiledVersion() == cudnnRuntimeVersion();
+  CAFFE_ENFORCE(version_match,
+                "cuDNN compiled (", cudnnCompiledVersion(), ") and "
+                "runtime (", cudnnRuntimeVersion(), ") versions mismatch");
+}
+
 /**
  * cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type
  * in a template function. The class is specialized explicitly for different
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
index 3cae5c4757a..8f3273bb9e7 100644
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -10,6 +10,7 @@
 #endif // CAFFE2_USE_CNMEM
 
 #include "caffe2/core/asan.h"
+#include "caffe2/core/common_cudnn.h"
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/core/init.h"
 #include "caffe2/core/logging.h"
@@ -139,6 +140,9 @@ static void Caffe2InitializeCuda() {
       TypeMeta::Id<Tensor<CUDAContext>>(),
       GetTensorShape<CUDAContext>
   );
+
+  // Check the versions of cuDNN that were compiled and linked with are compatible
+  CheckCuDNNVersions();
 }
 
 #ifdef CAFFE2_USE_CNMEM
diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc
index 6867916356d..bd1e0d6165f 100644
--- a/caffe2/operators/conv_op_cudnn.cc
+++ b/caffe2/operators/conv_op_cudnn.cc
@@ -56,9 +56,12 @@ class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
         pad_l_ == pad_r_,
         "The current padding scheme leads to unequal padding on the left "
         "and right, which is not supported by cudnn.");
+    // dilated convolution supported by some algorithms in cuDNN v6
+#if !(CUDNN_VERSION_MIN(6,0,0))
     OPERATOR_NEEDS_FEATURE(
         dilation_h_ == 1 && dilation_w_ == 1,
         "The cudnn convolution does not support dilation yet.");
+#endif
 
     CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
     CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
@@ -270,9 +273,15 @@ bool CudnnConvOp<T>::RunOnDevice() {
       H_out,
       W_out));
   // Set the convolution descriptor
+#if CUDNN_VERSION_MIN(6,0,0)
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+      conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, dilation_h_, dilation_w_,
+      CUDNN_CROSS_CORRELATION, cudnnTypeWrapper<T>::type));
+#else
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
       conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
       CUDNN_CROSS_CORRELATION));
+#endif
   if (deterministic_) {
     algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
   } else if (exhaustive_search_) {
@@ -450,9 +459,15 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
       H_out,
       W_out));
   // Set the convolution descriptor
+#if CUDNN_VERSION_MIN(6,0,0)
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+      conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, dilation_h_, dilation_w_,
+      CUDNN_CROSS_CORRELATION, cudnnTypeWrapper<T>::type));
+#else
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
       conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
       CUDNN_CROSS_CORRELATION));
+#endif
 
   // Set the workspace
   size_t bwd_filter_ws_size, bwd_data_ws_size;
diff --git a/caffe2/operators/conv_transpose_op_cudnn.cc b/caffe2/operators/conv_transpose_op_cudnn.cc
index f1fc5386b93..bd07bbb0fe0 100644
--- a/caffe2/operators/conv_transpose_op_cudnn.cc
+++ b/caffe2/operators/conv_transpose_op_cudnn.cc
@@ -228,6 +228,18 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() {
       pad_r_,
       "The current padding scheme leads to unequal padding on the left "
       "and right, which is not supported by cudnn.");
+#if CUDNN_VERSION_MIN(6,0,0)
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+      conv_desc_,
+      pad_t_,
+      pad_l_,
+      stride_h_,
+      stride_w_,
+      1,
+      1,
+      CUDNN_CROSS_CORRELATION,
+      cudnnTypeWrapper<T>::type));
+#else
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
       conv_desc_,
       pad_t_,
@@ -237,6 +249,7 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() {
       1,
       1,
       CUDNN_CROSS_CORRELATION));
+#endif
   if (deterministic_) {
     bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
   } else if (exhaustive_search_) {
@@ -434,6 +447,18 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
       pad_r_,
       "The current padding scheme leads to unequal padding on the left "
       "and right, which is not supported by cudnn.");
+#if CUDNN_VERSION_MIN(6,0,0)
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
+      conv_desc_,
+      pad_t_,
+      pad_l_,
+      stride_h_,
+      stride_w_,
+      1,
+      1,
+      CUDNN_CROSS_CORRELATION,
+      cudnnTypeWrapper<T>::type));
+#else
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
       conv_desc_,
       pad_t_,
@@ -443,6 +468,7 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
       1,
       1,
       CUDNN_CROSS_CORRELATION));
+#endif
 
   // Set the workspace
   size_t bwd_filter_ws_size, fwd_ws_size;
diff --git a/caffe2/operators/recurrent_op_cudnn.cc b/caffe2/operators/recurrent_op_cudnn.cc
index 5c28aecde8b..332cd17645d 100644
--- a/caffe2/operators/recurrent_op_cudnn.cc
+++ b/caffe2/operators/recurrent_op_cudnn.cc
@@ -301,6 +301,11 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
       Output(GRAD_WEIGHT)->template mutable_data<T>(),
       &context_);
 
+#if CUDNN_VERSION_MIN(6,0,0)
+  auto * reserve = Output(RNN_SCRATCH_OUT)->template mutable_data<T>();
+#else
+  const auto * reserve = Output(RNN_SCRATCH_OUT)->template data<T>();
+#endif
   cudnn_wrapper_.with_cudnn_state(0, [&](CuDNNState* state) {
     CUDNN_CHECK(cudnnRNNBackwardData(
         state->cudnn_handle(),
@@ -328,7 +333,7 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
         Output(GRAD_CELL_INPUT)->template mutable_data<T>(),
         state->workspace().get(cudnnWsNbytes_),
         cudnnWsNbytes_,
-        Input(RNN_SCRATCH).template data<T>(),
+        reserve,
         reserveNbytes_));
     CUDNN_CHECK(cudnnRNNBackwardWeights(
         state->cudnn_handle(),
@@ -344,7 +349,7 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
         cudnnWsNbytes_,
         wDesc_,
         Output(GRAD_WEIGHT)->template mutable_data<T>(),
-        Input(RNN_SCRATCH).template data<T>(),
+        reserve,
         reserveNbytes_));
   });
   return true;
@@ -426,7 +431,7 @@ input_mode) are passed directly through to CuDNN.
 )DOC");
 
 REGISTER_CUDNN_OPERATOR(RecurrentGradient, RecurrentGradientOp<float>);
-OPERATOR_SCHEMA(RecurrentGradient).NumInputs(9).NumOutputs(5);
+OPERATOR_SCHEMA(RecurrentGradient).NumInputs(9).NumOutputs(6);
 
 REGISTER_CUDNN_OPERATOR(RecurrentInit, RecurrentInitOp<float>);
 OPERATOR_SCHEMA(RecurrentInit).NumInputs(1).NumOutputs(2);
@@ -450,7 +455,8 @@ struct GetRecurrentGradient : public GradientMakerBase {
         GI(1), // GRAD_HIDDEN_INPUT
         GI(2), // GRAD_CELL_INPUT
         GI(3), // GRAD_WEIGHT
-        O(4) // DROPOUT_STATES
+        O(4), // DROPOUT_STATES
+        O(3) // RNN_SCRATCH
     });
   }
 };
diff --git a/caffe2/operators/recurrent_op_cudnn.h b/caffe2/operators/recurrent_op_cudnn.h
index c75b5ae6808..727320fd3ea 100644
--- a/caffe2/operators/recurrent_op_cudnn.h
+++ b/caffe2/operators/recurrent_op_cudnn.h
@@ -119,7 +119,8 @@ class RecurrentGradientOp : public RecurrentBaseOp<T> {
       GRAD_HIDDEN_INPUT,
       GRAD_CELL_INPUT,
       GRAD_WEIGHT,
-      DROPOUT_STATES);
+      DROPOUT_STATES,
+      RNN_SCRATCH_OUT);
 };
 
 template <typename T>
diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py
index ebedebd0b88..730ab655b2f 100644
--- a/caffe2/python/operator_test/conv_test.py
+++ b/caffe2/python/operator_test/conv_test.py
@@ -153,7 +153,9 @@ class TestConvolution(hu.HypothesisTestCase):
                                order, engine, use_bias, gc, dc):
         dkernel = dilation * (kernel - 1) + 1
-        assume("" == engine or 1 == dilation)
+        # cuDNN v6+ supports dilated convolutions
+        if (workspace.GetCuDNNVersion() < 6000):
+            assume("" == engine or 1 == dilation)
         assume(engine != "MKLDNN" or use_bias is True)
 
         op = core.CreateOperator(
@@ -211,8 +213,12 @@ class TestConvolution(hu.HypothesisTestCase):
         b = np.random.rand(output_channels).astype(np.float32) - 0.5
         Output = collections.namedtuple("Output", ["Y", "engine", "order"])
         outputs = []
+
+        # cuDNN v6+ supports dilated convolutions
+        engine_list = ["", "CUDNN"] if ((dilation == 1) or (workspace.GetCuDNNVersion() >= 6000)) else [""]
+
         for order in ["NCHW", "NHWC"]:
-            for engine in (["", "CUDNN"] if dilation == 1 else [""]):
+            for engine in engine_list:
                 op = core.CreateOperator(
                     "Conv",
                     ["X", "w", "b"] if use_bias else ["X", "w"],
diff --git a/caffe2/python/pybind_state_gpu.cc b/caffe2/python/pybind_state_gpu.cc
index ac99c4e3332..3a3e9043a69 100644
--- a/caffe2/python/pybind_state_gpu.cc
+++ b/caffe2/python/pybind_state_gpu.cc
@@ -10,6 +10,7 @@
 #include <pybind11/pybind11.h>
 
 #include "caffe2/core/context_gpu.h"
+#include "caffe2/core/common_cudnn.h"
 #include "caffe2/operators/operator_fallback_gpu.h"
 
 namespace caffe2 {
@@ -27,6 +28,7 @@ void addCUDAGlobalMethods(py::module& m) {
   m.def("num_cuda_devices", &NumCudaDevices);
   m.def("set_default_gpu_id", &SetDefaultGPUID);
   m.def("get_default_gpu_id", &GetDefaultGPUID);
+  m.def("get_cudnn_version", &cudnnCompiledVersion);
   m.def("get_cuda_peer_access_pattern", []() {
     std::vector<std::vector<bool>> pattern;
     CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern));
diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py
index c2232386321..e790b95805c 100644
--- a/caffe2/python/workspace.py
+++ b/caffe2/python/workspace.py
@@ -34,6 +34,7 @@ if has_gpu_support:
     NumCudaDevices = C.num_cuda_devices
     SetDefaultGPUID = C.set_default_gpu_id
     GetDefaultGPUID = C.get_default_gpu_id
+    GetCuDNNVersion = C.get_cudnn_version
 
     def GetCudaPeerAccessPattern():
         return np.asarray(C.get_cuda_peer_access_pattern())