Remove all cuDNN specific inputs to RNN functions (#10581)

Summary:
This is still not the final PR, but it removes all blockers to using the RNN functions directly in the JIT. The next patch should be final; it will remove the symbolic_override code and replace it with proper symbolics for those ATen functions. It turns out the symbolic code can also be cleaned up a bit, and I'll do that too.

zdevito ezyang
colesbury (for minor DispatchStub.h changes)

There was no way to handle those inputs in the JIT, and they turned
out to be completely unnecessary. This should also make the Python and C++
module code much simpler, since all the logic is now centralized
in the native functions.

The downside is that RNN modules no longer own their dropout buffers,
which are shared per-device instead (with appropriate locking and
synchronization). This might look like a performance regression at first,
but in practice it's highly unlikely that anyone will want to run multiple
cuDNN RNNs on the same GPU in parallel.
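
For reference, a minimal sketch of that sharing scheme (names here are illustrative only; the actual implementation is the DropoutState struct added to the cuDNN RNN bindings in this diff):

// Sketch, not part of the diff: one dropout buffer per visible GPU, guarded by a mutex
// for host-side exclusion and a CUDA event for device-side ordering.
struct SharedDropoutBuffer {
  at::Tensor buffer;          // lazily initialized cuDNN dropout state (a bit over 2 MB)
  at::cuda::CUDAEvent event;  // recorded after each use, waited on before the next
  std::mutex mutex;
};

// One entry per device. An RNN call locks its device's entry, makes its stream wait on
// `event`, runs cuDNN with `buffer`, records `event`, and unlocks.
static std::vector<SharedDropoutBuffer> per_device_state(at::cuda::getNumGPUs());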
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10581

Reviewed By: colesbury

Differential Revision: D9365541

Pulled By: apaszke

fbshipit-source-id: 3ef8677ee5481bae60c74a9117a2508665b476b5
This commit is contained in:
Adam Paszke 2018-08-17 11:07:31 -07:00 committed by Facebook Github Bot
parent 52058204d6
commit d35f365ad5
12 changed files with 531 additions and 161 deletions


@ -0,0 +1,66 @@
#include "ATen/cuda/CUDAEvent.h"
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/CUDAStream.h"
#include "ATen/cuda/Exceptions.h"
#include "ATen/core/Error.h"
#include <mutex>
#include <atomic>
// Internal implementation is entirely hidden
struct CUDAEventInternals {
std::atomic<int> refcount;
int64_t device; // Note: cudaGetDevice works with int32_t, not int64_t
cudaEvent_t event;
};
namespace at {
namespace cuda {
namespace detail {
/*
* Pointer-based event API
*/
CUDAEventInternals* CUDAEvent_create(unsigned int flags) {
std::unique_ptr<CUDAEventInternals> internals { new CUDAEventInternals() };
internals->refcount = 1;
internals->device = current_device();
AT_CUDA_CHECK(cudaEventCreateWithFlags(&internals->event, flags));
return internals.release();
}
void CUDAEvent_retain(CUDAEventInternals* internals) {
internals->refcount++;
}
void CUDAEvent_uncheckedFree(CUDAEventInternals* internals) {
  if (internals && --internals->refcount == 0) {
    // Destroy the event and free the refcounted wrapper once the last reference is gone.
    cudaEventDestroy(internals->event);
    delete internals;
  }
}
cudaEvent_t CUDAEvent_event(CUDAEventInternals* internals) {
return internals->event;
}
int64_t CUDAEvent_device(CUDAEventInternals* internals) {
return internals->device;
}
void CUDAEvent_record(CUDAEventInternals* internals, const CUDAStream& stream) {
AT_CUDA_CHECK(cudaEventRecord(internals->event, stream));
}
} // namespace detail
void CUDAEvent::record() const {
record(getCurrentCUDAStream());
}
void CUDAEvent::record(const CUDAStream& stream) const {
detail::CUDAEvent_record(internals_, stream);
}
} // namespace cuda
} // namespace at


@ -0,0 +1,84 @@
#pragma once
#include <cstdint>
#include <utility>
#include "cuda_runtime_api.h"
#include <ATen/ATenGeneral.h>
#include <ATen/Error.h>
/*
* A CUDA event interface with no CUDA build dependency.
*
* Includes the CUDAEvent RAII class and a pointer-based event API.
*/
struct CUDAEventInternals;
namespace at {
namespace cuda {
struct CUDAStream;
namespace detail {
// Pointer-based API (for internal use)
// Note: ATen/Context is preferred to work with streams safely
AT_API CUDAEventInternals* CUDAEvent_create(unsigned int flags);
AT_API void CUDAEvent_retain(CUDAEventInternals* internals);
AT_API void CUDAEvent_uncheckedFree(CUDAEventInternals* internals);
AT_API cudaEvent_t CUDAEvent_event(CUDAEventInternals* internals);
AT_API int64_t CUDAEvent_device(CUDAEventInternals* internals);
} // namespace detail
struct CUDAEvent {
// Constants
static constexpr unsigned int DEFAULT_FLAGS = cudaEventDisableTiming;
// Constructors
CUDAEvent(unsigned int flags = DEFAULT_FLAGS)
: internals_(detail::CUDAEvent_create(flags)) {}
~CUDAEvent() { detail::CUDAEvent_uncheckedFree(internals_); }
CUDAEvent(const CUDAEvent& other) {
detail::CUDAEvent_retain(other.internals_);
internals_ = other.internals_;
}
CUDAEvent(CUDAEvent&& other) {
std::swap(internals_, other.internals_);
}
CUDAEvent& operator=(CUDAEvent other) noexcept {
std::swap(internals_, other.internals_);
return *this;
}
explicit operator bool() const noexcept {
return internals_ != nullptr;
}
operator cudaEvent_t() const { return detail::CUDAEvent_event(internals_); }
// Less than operator (to allow use in sets)
friend bool operator<(const CUDAEvent& left, const CUDAEvent& right) {
return left.internals_ < right.internals_;
}
int64_t device() const { return detail::CUDAEvent_device(internals_); }
cudaEvent_t event() const { return detail::CUDAEvent_event(internals_); }
CUDAEventInternals* internals() const { return internals_; }
void record() const; // Record on the current stream
void record(const CUDAStream& stream) const;
private:
CUDAEventInternals* internals_ = nullptr; // nullptr only for moved-from events, which uncheckedFree tolerates
};
} // namespace cuda
} // namespace at


@ -1,5 +1,6 @@
#include "ATen/cuda/CUDAStream.h"
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/CUDAEvent.h"
#include "ATen/cuda/Exceptions.h"
#include "ATen/core/Error.h"
@ -173,6 +174,10 @@ namespace detail {
}
}
void CUDAStream_synchronize_with(CUDAStreamInternals* ptr, const CUDAEvent& event) {
AT_CUDA_CHECK(cudaStreamWaitEvent(ptr->stream, event, 0));
}
} // namespace detail
/*
@ -194,5 +199,9 @@ namespace detail {
std::swap(internals_, other.internals_);
}
void CUDAStream::synchronize_with(const CUDAEvent& event) const {
detail::CUDAStream_synchronize_with(internals_, event);
}
} // namespace cuda
} // namespace at


@ -15,12 +15,13 @@
* The ATen Context interface should be preferred when working with streams.
*/
// Forward-declares internals
struct CUDAStreamInternals;
namespace at {
namespace cuda {
struct CUDAEvent;
namespace detail {
// Pointer-based API (for internal use)
@ -102,6 +103,8 @@ struct CUDAStream {
cudaStream_t stream() const { return detail::CUDAStream_stream(internals_); }
CUDAStreamInternals* internals() const { return internals_; }
void synchronize_with(const CUDAEvent& event) const;
private:
CUDAStreamInternals* internals_ = nullptr;
};
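
Taken together, the new CUDAEvent class and CUDAStream::synchronize_with form a small cross-stream ordering primitive. A minimal usage sketch (hypothetical helper function, assuming a CUDA build; not part of the diff):

#include "ATen/cuda/CUDAEvent.h"
#include "ATen/cuda/CUDAStream.h"

// Make `consumer` wait (on the device) for all work submitted to `producer` so far.
void order_streams(const at::cuda::CUDAStream& producer,
                   const at::cuda::CUDAStream& consumer) {
  at::cuda::CUDAEvent done;        // created with cudaEventDisableTiming by default
  done.record(producer);           // cudaEventRecord on the producer stream
  consumer.synchronize_with(done); // cudaStreamWaitEvent under the hood
}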


@ -50,15 +50,15 @@ struct AT_API DispatchStub {
static_assert(std::is_pointer<FnPtr>::value, "FnPtr should be a pointer type");
template <typename... ArgTypes>
void operator()(Backend backend, ArgTypes... args) {
void operator()(Backend backend, ArgTypes&&... args) {
if (backend == Backend::CPU) {
if (!cpu_dispatch_ptr) {
cpu_dispatch_ptr = choose_cpu_impl();
}
(*cpu_dispatch_ptr)(args...);
(*cpu_dispatch_ptr)(std::forward<ArgTypes>(args)...);
} else if (backend == Backend::CUDA) {
AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel");
(*cuda_dispatch_ptr)(args...);
(*cuda_dispatch_ptr)(std::forward<ArgTypes>(args)...);
} else {
AT_ERROR("DispatchStub: unsupported backend", backend);
}
@ -109,12 +109,33 @@ struct RegisterDispatch {
#define DEFINE_DISPATCH(name) struct name name
#if defined(__CUDACC__)
#define REGISTER_DISPATCH(name, fn) \
#define REGISTER_ARCH_DISPATCH(name, arch, fn) \
template <> decltype(fn) DispatchStub<decltype(fn), struct name>::arch = fn;
#ifdef HAVE_AVX_CPU_DEFINITION
#define REGISTER_AVX_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX, fn)
#else
#define REGISTER_AVX_DISPATCH(name, fn)
#endif
#ifdef HAVE_AVX2_CPU_DEFINITION
#define REGISTER_AVX2_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX2, fn)
#else
#define REGISTER_AVX2_DISPATCH(name, fn)
#endif
#define REGISTER_NO_CPU_DISPATCH(name, fn_type) \
REGISTER_ARCH_DISPATCH(name, DEFAULT, static_cast<fn_type>(nullptr)) \
REGISTER_AVX_DISPATCH(name, static_cast<fn_type>(nullptr)) \
REGISTER_AVX2_DISPATCH(name, static_cast<fn_type>(nullptr))
#define REGISTER_CUDA_DISPATCH(name, fn) \
static RegisterDispatch<decltype(fn), struct name> name ## __register(name, fn);
#if defined(__CUDACC__)
#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn)
#elif defined(CPU_CAPABILITY)
#define REGISTER_DISPATCH(name, fn) \
template <> decltype(fn) DispatchStub<decltype(fn), struct name>::CPU_CAPABILITY = fn;
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#endif
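
As a rough illustration of how the new registration macros fit together (hypothetical names my_fn / my_op_stub / my_op_cuda; DECLARE_DISPATCH comes from the unchanged part of DispatchStub.h; not part of the diff):

// In a header, inside namespace at::native:
using my_fn = void (*)(at::Tensor& out, const at::Tensor& in);
DECLARE_DISPATCH(my_fn, my_op_stub);

// In the generic .cpp: define the stub and register nullptr for every CPU arch,
// since this op has no CPU kernels.
DEFINE_DISPATCH(my_op_stub);
REGISTER_NO_CPU_DISPATCH(my_op_stub, my_fn);

// In a file compiled as part of the CUDA build:
void my_op_cuda(at::Tensor& out, const at::Tensor& in) { /* ... */ }
REGISTER_CUDA_DISPATCH(my_op_stub, &my_op_cuda);

// Call site in the generic op:
//   my_op_stub(input.type().backend(), out, input);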


@ -1,3 +1,5 @@
#include "ATen/native/RNN.h"
#include "ATen/ATen.h"
#include "ATen/NativeFunctions.h"
@ -499,100 +501,6 @@ std::tuple<io_type, Tensor, Tensor> _lstm_impl(
return std::make_tuple(result.outputs, at::stack(hy, 0), at::stack(cy, 0));
}
////////////////////////////////////////////////////////////////////////////////
// CUDNN BINDINGS
////////////////////////////////////////////////////////////////////////////////
// These must line up with the CUDNN mode codes:
// https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnRNNMode_t
enum class CuDNNMode { rnn_relu = 0, rnn_tanh = 1, lstm = 2, gru = 3 };
std::tuple<Tensor, Tensor> unpack_hidden(const Tensor& hidden) {
return std::make_tuple(hidden, at::Tensor{});
}
std::tuple<Tensor, Tensor> unpack_hidden(const tpair_of<Tensor>& hidden) {
return hidden;
}
template<typename hidden_type>
hidden_type pack_hidden(const Tensor& hx, const Tensor& cx) {
static_assert(std::is_same<hidden_type, void>::value, "pack_hidden not implemented for this type");
AT_ERROR("NOT IMPLEMENTED");
}
template<>
Tensor pack_hidden<Tensor>(const Tensor& hx, const Tensor& cx) {
AT_ASSERT(cx.numel() == 0);
return hx;
}
template<>
tpair_of<Tensor> pack_hidden<tpair_of<Tensor>>(const Tensor& hx, const Tensor& cx) {
return std::make_tuple(hx, cx);
}
const char * WEIGHT_FORMAT_WARN = "RNN module weights are not part of a single contiguous "
"chunk of memory. This means they need to be compacted "
"at every call, possibly greatly increasing memory usage. "
"To compact weights again call flatten_parameters().";
template<typename hidden_type>
LayerOutput<Tensor, hidden_type> _cudnn_impl(
const Tensor& input, const Tensor& _batch_sizes,
const hidden_type& hidden,
TensorList params, bool has_biases,
CuDNNMode cudnn_mode, const Tensor& weight_buf, const Tensor& dropout_state,
int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
if (!weight_buf.defined()) {
AT_WARN(WEIGHT_FORMAT_WARN);
}
Tensor hx, cx;
std::tie(hx, cx) = unpack_hidden(hidden);
int64_t hidden_size = hx.size(2);
AT_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D");
IntList batch_sizes { _batch_sizes.data<int64_t>(), static_cast<size_t>(_batch_sizes.size(0)) };
// cudnn_output = std::tuple<output, hy, cy, reserve, new_weight_buf>
auto cudnn_output = at::_cudnn_rnn(
input, params, has_biases ? 4 : 2, weight_buf,
hx, cx, static_cast<int>(cudnn_mode), hidden_size,
num_layers, /*batch_first=*/false, dropout_p, train, bidirectional,
batch_sizes, dropout_state);
return {std::get<0>(cudnn_output),
pack_hidden<hidden_type>(std::get<1>(cudnn_output), std::get<2>(cudnn_output))};
}
template<typename hidden_type>
LayerOutput<Tensor, hidden_type> _cudnn_impl(
const Tensor& input,
const hidden_type& hidden,
TensorList params, bool has_biases,
CuDNNMode cudnn_mode, const Tensor& weight_buf, const Tensor& dropout_state,
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
if (!weight_buf.defined()) {
AT_WARN(WEIGHT_FORMAT_WARN);
}
Tensor hx, cx;
std::tie(hx, cx) = unpack_hidden(hidden);
int64_t hidden_size = hx.size(2);
// cudnn_output = std::tuple<output, hy, cy, reserve, new_weight_buf>
auto cudnn_output = at::_cudnn_rnn(
input, params, has_biases ? 4 : 2, weight_buf,
hx, cx, static_cast<int>(cudnn_mode), hidden_size,
num_layers, batch_first, dropout_p, train, bidirectional,
/*batch_sizes=*/{}, dropout_state);
return {std::get<0>(cudnn_output),
pack_hidden<hidden_type>(std::get<1>(cudnn_output), std::get<2>(cudnn_output))};
}
} // anonymous namespace
////////////////////////////////////////////////////////////////////////////////
@ -600,16 +508,20 @@ LayerOutput<Tensor, hidden_type> _cudnn_impl(
////////////////////////////////////////////////////////////////////////////////
#define ONE_HIDDEN_RNN(NAME, CELL) \
DEFINE_DISPATCH(NAME##_cudnn_stub); \
DEFINE_DISPATCH(NAME##_packed_cudnn_stub); \
REGISTER_NO_CPU_DISPATCH(NAME##_cudnn_stub, rnn_fn); \
REGISTER_NO_CPU_DISPATCH(NAME##_packed_cudnn_stub, rnn_packed_fn); \
\
std::tuple<Tensor, Tensor> NAME( \
const Tensor& _input, const Tensor& hx, \
TensorList _params, bool has_biases, \
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first, \
const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) { \
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { \
if (at::cudnn_is_acceptable(_input)) { \
auto result = _cudnn_impl(_input, hx, _params, has_biases, \
CuDNNMode::NAME, cudnn_weight_buf, cudnn_dropout_state, \
num_layers, dropout_p, train, bidirectional, batch_first); \
return std::make_tuple(result.outputs, result.final_hidden); \
Tensor output, hy; \
NAME##_cudnn_stub(_input.type().backend(), output, hy, _input, hx, _params, has_biases, \
num_layers, dropout_p, train, bidirectional, batch_first); \
return std::make_tuple(output, hy); \
} \
auto input = batch_first ? _input.transpose(0, 1) : _input; \
auto params = gather_params(_params, has_biases); \
@ -624,12 +536,12 @@ std::tuple<Tensor, Tensor> NAME( \
std::tuple<Tensor, Tensor> NAME( \
const Tensor& data, const Tensor& batch_sizes, const Tensor& hx, \
TensorList _params, bool has_biases, \
int64_t num_layers, double dropout_p, bool train, bool bidirectional, \
const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) { \
int64_t num_layers, double dropout_p, bool train, bool bidirectional) { \
if (at::cudnn_is_acceptable(data)) { \
auto result = _cudnn_impl(data, batch_sizes, hx, _params, has_biases, \
CuDNNMode::NAME, cudnn_weight_buf, cudnn_dropout_state, num_layers, dropout_p, train, bidirectional); \
return std::make_tuple(result.outputs, result.final_hidden); \
Tensor output, hy; \
NAME##_packed_cudnn_stub(data.type().backend(), output, hy, data, batch_sizes, hx, \
_params, has_biases, num_layers, dropout_p, train, bidirectional); \
return std::make_tuple(output, hy); \
} \
PackedSequence input { data, batch_sizes }; \
auto params = gather_params(_params, has_biases); \
@ -643,16 +555,21 @@ ONE_HIDDEN_RNN(gru, GRUCell)
ONE_HIDDEN_RNN(rnn_tanh, SimpleCell<tanh_f>)
ONE_HIDDEN_RNN(rnn_relu, SimpleCell<relu_f>)
DEFINE_DISPATCH(lstm_cudnn_stub);
DEFINE_DISPATCH(lstm_packed_cudnn_stub);
REGISTER_NO_CPU_DISPATCH(lstm_cudnn_stub, lstm_fn);
REGISTER_NO_CPU_DISPATCH(lstm_packed_cudnn_stub, lstm_packed_fn);
std::tuple<Tensor, Tensor, Tensor> lstm(
const Tensor& _input, TensorList hx,
TensorList _params, bool has_biases,
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first,
const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) {
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
AT_CHECK(hx.size() == 2, "lstm expects two hidden states");
if (at::cudnn_is_acceptable(_input)) {
auto result = _cudnn_impl(_input, std::make_tuple(hx[0], hx[1]), _params, has_biases,
CuDNNMode::lstm, cudnn_weight_buf, cudnn_dropout_state, num_layers, dropout_p, train, bidirectional, batch_first);
return std::make_tuple(result.outputs, std::get<0>(result.final_hidden), std::get<1>(result.final_hidden));
Tensor output, hy, cy;
lstm_cudnn_stub(_input.type().backend(), output, hy, cy, _input, hx, _params, has_biases,
num_layers, dropout_p, train, bidirectional, batch_first);
return std::make_tuple(output, hy, cy);
}
auto input = batch_first ? _input.transpose(0, 1) : _input;
auto params = gather_params(_params, has_biases);
@ -667,13 +584,13 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
std::tuple<Tensor, Tensor, Tensor> lstm(
const Tensor& data, const Tensor& batch_sizes, TensorList hx,
TensorList _params, bool has_biases,
int64_t num_layers, double dropout_p, bool train, bool bidirectional,
const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) {
int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
AT_CHECK(hx.size() == 2, "lstm expects two hidden states");
if (at::cudnn_is_acceptable(data)) {
auto result = _cudnn_impl(data, batch_sizes, std::make_tuple(hx[0], hx[1]), _params, has_biases,
CuDNNMode::lstm, cudnn_weight_buf, cudnn_dropout_state, num_layers, dropout_p, train, bidirectional);
return std::make_tuple(result.outputs, std::get<0>(result.final_hidden), std::get<1>(result.final_hidden));
Tensor output, hy, cy;
lstm_packed_cudnn_stub(data.type().backend(), output, hy, cy, data, batch_sizes, hx,
_params, has_biases, num_layers, dropout_p, train, bidirectional);
return std::make_tuple(output, hy, cy);
}
PackedSequence input { data, batch_sizes };
auto params = gather_params(_params, has_biases);


@ -0,0 +1,23 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/native/DispatchStub.h>
namespace at { namespace native {
using lstm_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool, bool);
using rnn_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool, bool);
using lstm_packed_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool);
using rnn_packed_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool);
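// The leading Tensor& arguments are outputs (output, hy[, cy]); the remaining arguments
// mirror the generic at::lstm / at::gru / at::rnn_* signatures. The packed variants take
// data + batch_sizes instead of input and have no batch_first flag.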
DECLARE_DISPATCH(lstm_fn, lstm_cudnn_stub);
DECLARE_DISPATCH(rnn_fn, gru_cudnn_stub);
DECLARE_DISPATCH(rnn_fn, rnn_tanh_cudnn_stub);
DECLARE_DISPATCH(rnn_fn, rnn_relu_cudnn_stub);
DECLARE_DISPATCH(lstm_packed_fn, lstm_packed_cudnn_stub);
DECLARE_DISPATCH(rnn_packed_fn, gru_packed_cudnn_stub);
DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_cudnn_stub);
DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_cudnn_stub);
}} // namespace at::native


@ -6,6 +6,8 @@
#include <ATen/core/Error.h>
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/native/RNN.h>
#if !AT_CUDNN_ENABLED()
@ -451,7 +453,7 @@ namespace {
// (same for the hh weights, and the ih and hh biases).
// Since we're storing all the weights in a single tensor anyway,
// might as well merge the CUDNN ones into a single tensor as well
int mat_numel = *filter_dim_a.prod(at::ScalarType::Int).data<int>();
if (linear_id == 0 || linear_id == num_linear_layers / 2) {
std::initializer_list<int64_t> size = {
mat_numel * num_linear_layers / 2, 1};
@ -477,6 +479,46 @@ namespace {
return std::make_pair(params, global_layer_params_count);
}
// This is a lightweight version of the method above used to quickly get the expected
// parameter offsets.
std::vector<void*> get_expected_data_ptrs(
const Tensor& weight_buf, cudnnHandle_t handle, const RNNDescriptorParams& rnn,
const RNNDescriptor& rnn_desc, const TensorDescriptor& x_desc, cudnnDataType_t datatype) {
FilterDescriptor w_desc;
w_desc.set(weight_buf, 3);
int64_t num_linear_layers = _num_linear_layers(rnn.mode);
int64_t num_dir_layers = rnn.num_directions() * rnn.num_layers;
const auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams };
std::vector<void*> data_ptrs;
data_ptrs.reserve(num_dir_layers * 2 * 2);
for (int64_t layer = 0; layer < num_dir_layers; layer++) {
for (auto cudnn_method : cudnn_methods) {
// This API returns a separate pointer for the weights of every gate,
// but we represent them as a single tensor, so we're only interested
// in a very limited subset of possible values.
const std::array<int64_t, 2> linear_offsets = { 0, num_linear_layers / 2 };
for (int64_t linear_id : linear_offsets) {
FilterDescriptor lin_layer_mat_desc;
void* matrix_pointer;
AT_CUDNN_CHECK(cudnn_method(
handle,
rnn_desc.desc(),
layer,
x_desc.desc(),
w_desc.desc(),
weight_buf.data_ptr(),
linear_id,
lin_layer_mat_desc.mut_desc(),
&matrix_pointer
));
data_ptrs.push_back(matrix_pointer);
}
}
}
return data_ptrs;
}
void _copyParams(MatrixRef<Tensor> params_from, MatrixRef<Tensor> params_to) {
AT_ASSERTM(params_from.size(0) == params_to.size(0), "number of layers mismatch");
for (size_t i = 0; i < params_from.size(0); i++) {
@ -1007,6 +1049,234 @@ Tensor _cudnn_init_dropout_state(const Type& ty, double dropout, bool train, int
return dropout_desc.state;
}
////////////////////////////////////////////////////////////////////////////////
// CUDA dispatch for the generic RNN ops (at::lstm, at::gru, ...)
////////////////////////////////////////////////////////////////////////////////
namespace {
// Helpers for working with different hidden types.
std::tuple<Tensor, Tensor> unpack_hidden(const Tensor& hidden) {
return std::make_tuple(hidden, at::Tensor{});
}
std::tuple<Tensor, Tensor> unpack_hidden(const std::tuple<Tensor, Tensor>& hidden) {
return hidden;
}
template<typename hidden_type>
hidden_type pack_hidden(const Tensor& hx, const Tensor& cx) {
static_assert(std::is_same<hidden_type, void>::value, "pack_hidden not implemented for this type");
AT_ERROR("NOT IMPLEMENTED");
}
template<>
Tensor pack_hidden<Tensor>(const Tensor& hx, const Tensor& cx) {
AT_ASSERT(cx.numel() == 0);
return hx;
}
template<>
std::tuple<Tensor, Tensor> pack_hidden<std::tuple<Tensor, Tensor>>(const Tensor& hx, const Tensor& cx) {
return std::make_tuple(hx, cx);
}
struct DropoutState {
at::Tensor buffer;
cuda::CUDAEvent event;
std::mutex mutex;
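  // Every user first takes the mutex (host-side exclusion) and then makes its current
  // stream wait on the event recorded by the previous user, so device-side reads and
  // writes of `buffer` are ordered as well; see lock()/unlock() below.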
void lock() {
mutex.lock();
cuda::getCurrentCUDAStream().synchronize_with(event);
}
void unlock() {
event.record();
mutex.unlock();
}
};
DropoutState& get_dropout_state(const Type& tp, double dropout_p, bool train) {
// Each state is slightly over 2MB and initialized lazily, so it's fine to cache them.
static std::vector<DropoutState> ten_dropout_state_cache { static_cast<size_t>(cuda::getNumGPUs()) };
static std::vector<DropoutState> var_dropout_state_cache { static_cast<size_t>(cuda::getNumGPUs()) };
static std::mutex state_cache_mut;
int device = cuda::current_device();
std::unique_lock<std::mutex> lock {state_cache_mut};
auto& state = tp.is_variable() ? var_dropout_state_cache.at(device)
: ten_dropout_state_cache.at(device);
if (!state.buffer.defined() && dropout_p > 0 && train) {
int64_t seed = at::empty({}, at::kLong).random_().toCLong();
state.buffer = at::_cudnn_init_dropout_state(
tp.toScalarType(at::kByte), dropout_p, train, seed);
// NB: The event will have already been constructed by now, but CUDA binds an event
// to a device at creation time, and all events in the cache are initially assigned to
// whichever device was active the first time this function was called.
state.event = cuda::CUDAEvent{};
}
return state;
}
Tensor try_get_weight_buf(
const Tensor& input, TensorList parameters, bool has_biases,
cudnnRNNMode_t mode, int64_t hidden_size, int64_t num_layers, bool bidirectional) {
// Prepare all relevant descriptors
auto handle = getCudnnHandle();
auto datatype = getCudnnDataType(input);
RNNDescriptorParams rnn;
rnn.set(mode, hidden_size, num_layers, bidirectional, datatype);
RNNDescriptor rnn_desc = rnn.descriptor(handle);
TensorGeometry x_geom ({1, input.size(-1)});
TensorDescriptor x_desc;
x_desc.set(datatype, x_geom.sizes(), x_geom.strides(), 5);
auto num_params = get_num_weights(handle, rnn_desc, x_desc, datatype);
// Try to get parameter storage
auto & any_param = parameters.at(0);
auto param_storage = any_param.storage();
auto weight_buf = any_param.type().tensor().set_(*param_storage);
if (weight_buf.size(0) < num_params) {
return {};
} else if (weight_buf.size(0) > num_params) {
weight_buf = weight_buf.narrow(0, 0, num_params);
}
// Get and check data pointers
auto expected_data_ptrs = get_expected_data_ptrs(
weight_buf, handle, rnn, rnn_desc, x_desc, datatype);
int64_t num_parameters = parameters.size();
int64_t num_ptrs = expected_data_ptrs.size();
AT_ASSERT(num_ptrs == (num_parameters * (has_biases ? 1 : 2)));
AT_ASSERT(num_ptrs % (has_biases ? 4 : 2) == 0);
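  // cuDNN reports a pointer for every weight and bias slot, so each layer/direction
  // contributes four pointers (w_ih, w_hh, b_ih, b_hh). With biases, pointers and
  // parameters advance in lockstep; without biases we compare the two weight pointers
  // and then skip over the two bias slots.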
for (int64_t param_i = 0, ptr_i = 0;
ptr_i < num_ptrs;
ptr_i += (has_biases ? 2 : 4), param_i += 2) {
if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) return {};
if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) return {};
}
if (!parameters[num_parameters - 1].is_contiguous()) return {};
return weight_buf;
}
const char * WEIGHT_FORMAT_WARN = "RNN module weights are not part of a single contiguous "
"chunk of memory. This means they need to be compacted "
"at every call, possibly greatly increasing memory usage. "
"To compact weights again call flatten_parameters().";
template<typename hidden_type>
std::pair<Tensor, hidden_type> _cudnn_impl(
const Tensor& input, const Tensor& _batch_sizes, const hidden_type& hidden,
TensorList params, bool has_biases, cudnnRNNMode_t mode,
int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
Tensor hx, cx;
std::tie(hx, cx) = unpack_hidden(hidden);
int64_t hidden_size = hx.size(2);
auto weight_buf = try_get_weight_buf(
input, params, has_biases, mode, hidden_size, num_layers, bidirectional);
if (!weight_buf.defined()) {
AT_WARN(WEIGHT_FORMAT_WARN);
}
AT_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D");
IntList batch_sizes { _batch_sizes.data<int64_t>(), static_cast<size_t>(_batch_sizes.size(0)) };
auto & dropout_state = get_dropout_state(input.type(), dropout_p, train);
std::unique_lock<DropoutState> lock { dropout_state };
// cudnn_output = std::tuple<output, hy, cy, reserve, new_weight_buf>
auto cudnn_output = at::_cudnn_rnn(
input, params, has_biases ? 4 : 2, weight_buf,
hx, cx, static_cast<int>(mode), hidden_size, num_layers, /*batch_first=*/false,
dropout_p, train, bidirectional, batch_sizes, dropout_state.buffer);
return {std::get<0>(cudnn_output),
pack_hidden<hidden_type>(std::get<1>(cudnn_output), std::get<2>(cudnn_output))};
}
template<typename hidden_type>
std::pair<Tensor, hidden_type> _cudnn_impl(
const Tensor& input, const hidden_type& hidden,
TensorList params, bool has_biases, cudnnRNNMode_t mode,
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
Tensor hx, cx;
std::tie(hx, cx) = unpack_hidden(hidden);
int64_t hidden_size = hx.size(2);
auto weight_buf = try_get_weight_buf(
input, params, has_biases, mode, hidden_size, num_layers, bidirectional);
if (!weight_buf.defined()) {
AT_WARN(WEIGHT_FORMAT_WARN);
}
auto & dropout_state = get_dropout_state(input.type(), dropout_p, train);
std::unique_lock<DropoutState> lock { dropout_state };
// cudnn_output = std::tuple<output, hy, cy, reserve, new_weight_buf>
auto cudnn_output = at::_cudnn_rnn(
input, params, has_biases ? 4 : 2, weight_buf,
hx, cx, static_cast<int>(mode), hidden_size, num_layers, batch_first, dropout_p,
train, bidirectional, /*batch_sizes=*/{}, dropout_state.buffer);
return {std::get<0>(cudnn_output),
pack_hidden<hidden_type>(std::get<1>(cudnn_output), std::get<2>(cudnn_output))};
}
#define ONE_HIDDEN_RNN(NAME, MODE) \
void NAME##_cudnn(Tensor& output, Tensor& hy, \
const Tensor& input, const Tensor& hx, \
TensorList params, bool has_biases, \
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { \
std::tie(output, hy) = _cudnn_impl(input, hx, params, has_biases, \
MODE, num_layers, dropout_p, train, bidirectional, batch_first); \
} \
\
void NAME##_packed_cudnn(Tensor& output, Tensor& hy, \
const Tensor& data, const Tensor& batch_sizes, const Tensor& hx, \
TensorList params, bool has_biases, \
int64_t num_layers, double dropout_p, bool train, bool bidirectional) { \
std::tie(output, hy) = _cudnn_impl(data, batch_sizes, hx, params, \
has_biases, MODE, num_layers, dropout_p, train, bidirectional); \
} \
\
REGISTER_CUDA_DISPATCH(NAME##_cudnn_stub, &NAME##_cudnn); \
REGISTER_CUDA_DISPATCH(NAME##_packed_cudnn_stub, &NAME##_packed_cudnn);
ONE_HIDDEN_RNN(gru, CUDNN_GRU)
ONE_HIDDEN_RNN(rnn_tanh, CUDNN_RNN_TANH)
ONE_HIDDEN_RNN(rnn_relu, CUDNN_RNN_RELU)
void lstm_cudnn(Tensor& output, Tensor& hy, Tensor& cy,
const Tensor& input, TensorList hx,
TensorList params, bool has_biases,
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
auto result = _cudnn_impl(input, std::make_tuple(hx[0], hx[1]), params, has_biases,
CUDNN_LSTM, num_layers, dropout_p, train, bidirectional, batch_first);
output = result.first;
hy = std::get<0>(result.second);
cy = std::get<1>(result.second);
}
void lstm_packed_cudnn(Tensor& output, Tensor& hy, Tensor& cy,
const Tensor& data, const Tensor& batch_sizes, TensorList hx,
TensorList params, bool has_biases,
int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
auto result = _cudnn_impl(data, batch_sizes, std::make_tuple(hx[0], hx[1]),
params, has_biases, CUDNN_LSTM, num_layers, dropout_p, train, bidirectional);
output = result.first;
hy = std::get<0>(result.second);
cy = std::get<1>(result.second);
}
REGISTER_CUDA_DISPATCH(lstm_cudnn_stub, &lstm_cudnn);
REGISTER_CUDA_DISPATCH(lstm_packed_cudnn_stub, &lstm_packed_cudnn);
} // anonymous namespace
}} // namespace at::native
#endif // AT_CUDNN_ENABLED()


@ -2053,28 +2053,28 @@
variants: function
# RNN cells and layers
- func: lstm(Tensor input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor, Tensor)
- func: lstm(Tensor input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)
variants: function
- func: lstm(Tensor data, Tensor batch_sizes, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor, Tensor)
- func: lstm(Tensor data, Tensor batch_sizes, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor)
variants: function
- func: gru(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor)
- func: gru(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
variants: function
- func: gru(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor)
- func: gru(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
variants: function
- func: rnn_tanh(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor)
- func: rnn_tanh(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
variants: function
- func: rnn_tanh(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor)
- func: rnn_tanh(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
variants: function
- func: rnn_relu(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor)
- func: rnn_relu(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
variants: function
- func: rnn_relu(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor)
- func: rnn_relu(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
variants: function
- func: lstm_cell(Tensor input, TensorList hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih={}, Tensor? b_hh={}) -> (Tensor, Tensor)
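
With the cuDNN-specific arguments gone, a caller (or the JIT) invokes the generic ops with only the logical RNN arguments. A hedged C++ sketch of the new lstm call (shapes and values are illustrative; cuDNN is selected automatically when cudnn_is_acceptable passes):

// Hypothetical call site using the new schema (single-layer, unidirectional LSTM).
const int64_t T = 5, B = 3, I = 10, H = 20;
at::Tensor input = at::randn({T, B, I});
at::Tensor h0 = at::zeros({1, B, H}), c0 = at::zeros({1, B, H});
at::Tensor w_ih = at::randn({4 * H, I}), w_hh = at::randn({4 * H, H});
at::Tensor b_ih = at::zeros({4 * H}), b_hh = at::zeros({4 * H});

at::Tensor output, hy, cy;
std::tie(output, hy, cy) = at::lstm(
    input, {h0, c0}, {w_ih, w_hh, b_ih, b_hh}, /*has_biases=*/true,
    /*num_layers=*/1, /*dropout=*/0.0, /*train=*/false,
    /*bidirectional=*/false, /*batch_first=*/false);
// output: {T, B, H}; hy and cy: {1, B, H}.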


@ -13,8 +13,7 @@ except ImportError:
def _select_rnn_impl(mode, input_size, hidden_size, num_layers=1, batch_first=False,
dropout=0, train=True, bidirectional=False, variable_length=False,
dropout_state=None, flat_weight=None):
dropout=0, train=True, bidirectional=False, dropout_state=None):
hidden_is_tensor = True
if mode == 'RNN_RELU':
impl = torch._C._VariableFunctions.rnn_relu
@ -31,18 +30,11 @@ def _select_rnn_impl(mode, input_size, hidden_size, num_layers=1, batch_first=Fa
def forward(input, weight, hidden, batch_sizes):
has_biases = len(weight[0]) == 4
weight = sum(weight, type(weight[0])())
if cudnn.is_acceptable(input):
dropout_seed = int(torch.IntTensor(1).random_())
with torch.cuda.device(input.get_device()):
dropout_ts = cudnn.rnn.init_dropout_state(dropout, train, dropout_seed, dropout_state)
if batch_sizes is None:
result = impl(input, hidden, weight, has_biases, num_layers, dropout, train, bidirectional, batch_first)
else:
dropout_ts = None
if not variable_length:
result = impl(input, hidden, weight, has_biases, num_layers, dropout, train, bidirectional,
batch_first, flat_weight, dropout_ts)
else:
result = impl(input, batch_sizes, hidden, weight, has_biases, num_layers, dropout, train,
bidirectional, flat_weight, dropout_ts)
result = impl(input, batch_sizes, hidden, weight, has_biases, num_layers, dropout, train, bidirectional)
return result[0], (result[1] if hidden_is_tensor else result[1:])
return forward


@ -80,7 +80,6 @@ class RNNBase(Module):
"""
any_param = next(self.parameters()).data
if not any_param.is_cuda or not torch.backends.cudnn.is_acceptable(any_param):
self._data_ptrs = []
return
# If any parameters alias, we fall back to the slower, copying code path. This is
@ -89,7 +88,6 @@ class RNNBase(Module):
# Module.named_parameters().
unique_data_ptrs = set(p.data_ptr() for l in self.all_weights for p in l)
if len(unique_data_ptrs) != sum(len(l) for l in self.all_weights):
self._data_ptrs = []
return
with torch.cuda.device_of(any_param):
@ -108,9 +106,6 @@ class RNNBase(Module):
self.input_size, rnn.get_cudnn_mode(self.mode), self.hidden_size, self.num_layers,
self.batch_first, bool(self.bidirectional))
self._param_buf_size = weight_buf.size(0)
self._data_ptrs = list(p.data.data_ptr() for p in self.parameters())
def _apply(self, fn):
ret = super(RNNBase, self)._apply(fn)
self.flatten_parameters()
@ -171,14 +166,6 @@ class RNNBase(Module):
if self.mode == 'LSTM':
hx = (hx, hx)
has_flat_weights = list(p.data.data_ptr() for p in self.parameters()) == self._data_ptrs
if has_flat_weights:
first_data = next(self.parameters()).data
assert first_data.storage().size() == self._param_buf_size
flat_weight = first_data.new().set_(first_data.storage(), 0, torch.Size([self._param_buf_size]))
else:
flat_weight = None
self.check_forward_args(input, hx, batch_sizes)
func = _get_rnn_impl(
self.mode,
@ -190,8 +177,6 @@ class RNNBase(Module):
train=self.training,
bidirectional=self.bidirectional,
dropout_state=self.dropout_state,
variable_length=is_packed,
flat_weight=flat_weight
)
output, hidden = func(input, self.all_weights, hx, batch_sizes)
if is_packed:
@ -214,7 +199,6 @@ class RNNBase(Module):
def __setstate__(self, d):
super(RNNBase, self).__setstate__(d)
self.__dict__.setdefault('_data_ptrs', [])
if 'all_weights' in d:
self._all_weights = d['all_weights']
if isinstance(self._all_weights[0][0], str):


@ -1066,6 +1066,7 @@ def rnn_trace_override_symbolic(cell_type, func, sym, g, input, weights, hiddens
inputs = list(itertools.chain.from_iterable(
[[input], flattened_weights, hiddens,
[batch_sizes] if batch_sizes else []]))
outputs = g.wrapPyFuncWithSymbolic(
forward_flattened_wrapper,
inputs,