Summary:
Add cuDNN v6 support, including test coverage for dilated convolution.
Add a check to ensure that the cuDNN versions used to compile Caffe2 and to run it are compatible.
Closes https://github.com/caffe2/caffe2/pull/85

Reviewed By: bwasti

Differential Revision: D4387690

Pulled By: Yangqing

fbshipit-source-id: 312960134398dd4afe6ee0c01cdc160046c904e8
This commit is contained in:
Simon Layton 2017-02-28 17:38:44 -08:00 committed by Facebook Github Bot
parent 1c92e85dae
commit fbf47a8825
9 changed files with 90 additions and 7 deletions

View file

@ -18,6 +18,9 @@ static_assert(
CUDNN_VERSION >= 5000,
"Caffe2 requires cudnn version 5.0 or above.");
#define CUDNN_VERSION_MIN(major, minor, patch) \
(CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))
namespace caffe2 {
namespace internal {
@ -70,6 +73,25 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
::caffe2::internal::cudnnGetErrorString(status)); \
} while (0)
// Report the version of cuDNN that Caffe2 was compiled with.
inline size_t cudnnCompiledVersion() {
  return CUDNN_VERSION;
}
// Report the version of cuDNN that Caffe2 is running with.
inline size_t cudnnRuntimeVersion() {
  return cudnnGetVersion();
}
// Check that the compiled and runtime cuDNN versions are compatible.
inline void CheckCuDNNVersions() {
  // The version format is major*1000 + minor*100 + patch, e.g. 5110 for
  // cuDNN 5.1.10. Major, minor and patch versions must all match.
  bool version_match = cudnnCompiledVersion() == cudnnRuntimeVersion();
  CAFFE_ENFORCE(version_match,
                "cuDNN compiled (", cudnnCompiledVersion(), ") and ",
                "runtime (", cudnnRuntimeVersion(), ") versions mismatch");
}
/**
* cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type
* in a template function. The class is specialized explicitly for different

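For illustration, here is a minimal standalone sketch of the version encoding that CUDNN_VERSION_MIN and CheckCuDNNVersions rely on. The CUDNN_VERSION value below is a hypothetical stand-in for the one cudnn.h provides:

    #include <iostream>

    // Hypothetical compile-time version: the encoding is
    // major*1000 + minor*100 + patch, so cuDNN 6.0.21 -> 6021.
    #define CUDNN_VERSION 6021

    // Same expansion as the macro added to common_cudnn.h above.
    #define CUDNN_VERSION_MIN(major, minor, patch) \
      (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))

    int main() {
      std::cout << CUDNN_VERSION_MIN(6, 0, 0) << "\n";  // 1: 6021 >= 6000
      std::cout << CUDNN_VERSION_MIN(7, 0, 0) << "\n";  // 0: 6021 <  7000
      return 0;
    }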
View file

@ -10,6 +10,7 @@
#endif // CAFFE2_USE_CNMEM
#include "caffe2/core/asan.h"
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
@ -139,6 +140,9 @@ static void Caffe2InitializeCuda() {
TypeMeta::Id<Tensor<CUDAContext>>(),
GetTensorShape<CUDAContext>
);
// Check that the compiled and runtime cuDNN versions are compatible
CheckCuDNNVersions();
}
#ifdef CAFFE2_USE_CNMEM

View file

@ -56,9 +56,12 @@ class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
pad_l_ == pad_r_,
"The current padding scheme leads to unequal padding on the left "
"and right, which is not supported by cudnn.");
// Dilated convolution is supported by some algorithms in cuDNN v6
#if !(CUDNN_VERSION_MIN(6,0,0))
OPERATOR_NEEDS_FEATURE(
dilation_h_ == 1 && dilation_w_ == 1,
"The cudnn convolution does not support dilation yet.");
#endif
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
@ -270,9 +273,15 @@ bool CudnnConvOp<T>::RunOnDevice() {
H_out,
W_out));
// Set the convolution descriptor
#if CUDNN_VERSION_MIN(6,0,0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, dilation_h_, dilation_w_,
CUDNN_CROSS_CORRELATION, cudnnTypeWrapper<T>::type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
CUDNN_CROSS_CORRELATION));
#endif
if (deterministic_) {
algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else if (exhaustive_search_) {
@ -450,9 +459,15 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
H_out,
W_out));
// Set the convolution descriptor
#if CUDNN_VERSION_MIN(6,0,0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, dilation_h_, dilation_w_,
CUDNN_CROSS_CORRELATION, cudnnTypeWrapper<T>::type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_, pad_t_, pad_l_, stride_h_, stride_w_, 1, 1,
CUDNN_CROSS_CORRELATION));
#endif
// Set the workspace
size_t bwd_filter_ws_size, bwd_data_ws_size;

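As background for the dilation guards above, a small self-contained sketch of the arithmetic involved, using the same dkernel = dilation * (kernel - 1) + 1 formula as the conv test further down; all sizes here are illustrative:

    #include <iostream>

    int main() {
      const int kernel = 3, dilation = 2;       // illustrative filter settings
      const int size = 7, pad = 0, stride = 1;  // illustrative input settings
      // A dilated filter samples every dilation-th element, which enlarges
      // its effective extent without adding weights.
      const int dkernel = dilation * (kernel - 1) + 1;               // 5
      const int out_size = (size + 2 * pad - dkernel) / stride + 1;  // 3
      std::cout << "effective kernel " << dkernel
                << ", output " << out_size << std::endl;
      return 0;
    }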
View file

@ -228,6 +228,18 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() {
pad_r_,
"The current padding scheme leads to unequal padding on the left "
"and right, which is not supported by cudnn.");
#if CUDNN_VERSION_MIN(6,0,0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
pad_l_,
stride_h_,
stride_w_,
1,
1,
CUDNN_CROSS_CORRELATION,
cudnnTypeWrapper<T>::type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
@ -237,6 +249,7 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() {
1,
1,
CUDNN_CROSS_CORRELATION));
#endif
if (deterministic_) {
bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} else if (exhaustive_search_) {
@ -434,6 +447,18 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
pad_r_,
"The current padding scheme leads to unequal padding on the left "
"and right, which is not supported by cudnn.");
#if CUDNN_VERSION_MIN(6,0,0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
pad_l_,
stride_h_,
stride_w_,
1,
1,
CUDNN_CROSS_CORRELATION,
cudnnTypeWrapper<T>::type));
#else
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
@ -443,6 +468,7 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
1,
1,
CUDNN_CROSS_CORRELATION));
#endif
// Set the workspace
size_t bwd_filter_ws_size, fwd_ws_size;

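The #if/#else blocks in the two convolution files exist because cuDNN v6 changed the signature of cudnnSetConvolution2dDescriptor. Roughly, paraphrasing the cuDNN headers (parameter names approximate):

    // cuDNN v5.x: the two arguments after the strides are upscale factors
    // and must both be 1; there is no compute-type argument.
    //   cudnnSetConvolution2dDescriptor(desc, pad_h, pad_w, stride_h,
    //                                   stride_w, 1, 1, mode);
    //
    // cuDNN v6.x: the same slots become real dilation factors, and a
    // trailing cudnnDataType_t compute type is required.
    //   cudnnSetConvolution2dDescriptor(desc, pad_h, pad_w, stride_h,
    //                                   stride_w, dilation_h, dilation_w,
    //                                   mode, compute_type);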
View file

@ -301,6 +301,11 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
Output(GRAD_WEIGHT)->template mutable_data<T>(),
&context_);
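// cuDNN v6 takes the RNN reserve space as a mutable pointer in the backward
// calls (it may be written during the backward pass), which is why the
// scratch blob is re-exposed as an output (RNN_SCRATCH_OUT) below and
// fetched via mutable_data; under v5 a const pointer still suffices.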
#if CUDNN_VERSION_MIN(6,0,0)
auto * reserve = Output(RNN_SCRATCH_OUT)->template mutable_data<T>();
#else
const auto * reserve = Output(RNN_SCRATCH_OUT)->template data<T>();
#endif
cudnn_wrapper_.with_cudnn_state(0, [&](CuDNNState* state) {
CUDNN_CHECK(cudnnRNNBackwardData(
state->cudnn_handle(),
@ -328,7 +333,7 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
Output(GRAD_CELL_INPUT)->template mutable_data<T>(),
state->workspace().get(cudnnWsNbytes_),
cudnnWsNbytes_,
Input(RNN_SCRATCH).template data<T>(),
reserve,
reserveNbytes_));
CUDNN_CHECK(cudnnRNNBackwardWeights(
state->cudnn_handle(),
@ -344,7 +349,7 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
cudnnWsNbytes_,
wDesc_,
Output(GRAD_WEIGHT)->template mutable_data<T>(),
Input(RNN_SCRATCH).template data<T>(),
reserve,
reserveNbytes_));
});
return true;
@ -426,7 +431,7 @@ input_mode) are passed directly through to CuDNN.
)DOC");
REGISTER_CUDNN_OPERATOR(RecurrentGradient, RecurrentGradientOp<float>);
OPERATOR_SCHEMA(RecurrentGradient).NumInputs(9).NumOutputs(5);
OPERATOR_SCHEMA(RecurrentGradient).NumInputs(9).NumOutputs(6);
REGISTER_CUDNN_OPERATOR(RecurrentInit, RecurrentInitOp<float>);
OPERATOR_SCHEMA(RecurrentInit).NumInputs(1).NumOutputs(2);
@ -450,7 +455,8 @@ struct GetRecurrentGradient : public GradientMakerBase {
GI(1), // GRAD_HIDDEN_INPUT
GI(2), // GRAD_CELL_INPUT
GI(3), // GRAD_WEIGHT
O(4) // DROPOUT_STATES
O(4), // DROPOUT_STATES
O(3) // RNN_SCRATCH
});
}
};

View file

@ -119,7 +119,8 @@ class RecurrentGradientOp : public RecurrentBaseOp<T> {
GRAD_HIDDEN_INPUT,
GRAD_CELL_INPUT,
GRAD_WEIGHT,
DROPOUT_STATES);
DROPOUT_STATES,
RNN_SCRATCH_OUT);
};
template <typename T>

View file

@ -153,7 +153,9 @@ class TestConvolution(hu.HypothesisTestCase):
order, engine, use_bias, gc, dc):
dkernel = dilation * (kernel - 1) + 1
assume("" == engine or 1 == dilation)
# cuDNN v6+ supports dilated convolutions
if workspace.GetCuDNNVersion() < 6000:
    assume("" == engine or 1 == dilation)
assume(engine != "MKLDNN" or use_bias is True)
op = core.CreateOperator(
@ -211,8 +213,12 @@ class TestConvolution(hu.HypothesisTestCase):
b = np.random.rand(output_channels).astype(np.float32) - 0.5
Output = collections.namedtuple("Output", ["Y", "engine", "order"])
outputs = []
# cuDNN v6+ supports dilated convolutions
engine_list = ["", "CUDNN"] if ((dilation == 1) or (workspace.GetCuDNNVersion() >= 6000)) else [""]
for order in ["NCHW", "NHWC"]:
for engine in (["", "CUDNN"] if dilation == 1 else [""]):
for engine in engine_list:
op = core.CreateOperator(
"Conv",
["X", "w", "b"] if use_bias else ["X", "w"],

View file

@ -10,6 +10,7 @@
#include <pybind11/stl.h>
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/common_cudnn.h"
#include "caffe2/operators/operator_fallback_gpu.h"
namespace caffe2 {
@ -27,6 +28,7 @@ void addCUDAGlobalMethods(py::module& m) {
m.def("num_cuda_devices", &NumCudaDevices);
m.def("set_default_gpu_id", &SetDefaultGPUID);
m.def("get_default_gpu_id", &GetDefaultGPUID);
m.def("get_cudnn_version", &cudnnCompiledVersion);
m.def("get_cuda_peer_access_pattern", []() {
std::vector<std::vector<bool>> pattern;
CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern));

View file

@ -34,6 +34,7 @@ if has_gpu_support:
NumCudaDevices = C.num_cuda_devices
SetDefaultGPUID = C.set_default_gpu_id
GetDefaultGPUID = C.get_default_gpu_id
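# Compile-time cuDNN version, encoded as major * 1000 + minor * 100 + patch;
# the dilated-convolution tests compare it against 6000 (cuDNN v6).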
GetCuDNNVersion = C.get_cudnn_version
def GetCudaPeerAccessPattern():
return np.asarray(C.get_cuda_peer_access_pattern())