Add cuDNN handles to CUDAContext

Summary:
Add CUDAContext::cudnn_handle() for easier integration of single
cudnn routines into operators without requiring the weight
of CuDNNWrapper or similar, or needing to spin out a separate CuDNN*Op
version of an operator.

It was necessary to split the cuDNN wrapper code out of the base cuDNN helpers in order to resolve a circular dependency between context_gpu.h and common_cudnn.h when the handles and cuDNN `#define`s were added.
Closes https://github.com/caffe2/caffe2/pull/1376

Reviewed By: pietern

Differential Revision: D6162034

Pulled By: akyrola

fbshipit-source-id: 95687e55b3e1e921e1f5e0f016f43b586f5f3350
This commit is contained in:
Simon Layton 2017-10-27 11:59:03 -07:00 committed by Facebook Github Bot
parent 86dc6e0837
commit f3078dec64
14 changed files with 202 additions and 196 deletions

View file

@ -15,13 +15,12 @@
*/
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/init.h"
namespace caffe2 {
thread_local CuDNNHandles CuDNNWrapper::tls_cudnn_handles_;
CuDNNWrapper::PerGPUCuDNNStates& CuDNNWrapper::cudnn_states() {
// New it (never delete) to avoid calling the destructors on process
// exit and racing against the CUDA shutdown sequence.

View file

@ -23,9 +23,7 @@
#include <cudnn.h>
#include "caffe2/core/common.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"
@ -316,185 +314,6 @@ class cudnnFilterDescWrapper {
DISABLE_COPY_AND_ASSIGN(cudnnFilterDescWrapper);
};
class CuDNNWrapper;
/**
 * Thread-exit-safe owner of one cudnnHandle_t per GPU.
 *
 * Handle slots start out null; CuDNNWrapper (a friend) lazily creates them,
 * and this destructor tears down whatever was created when the owning
 * thread exits.
 */
class CuDNNHandles {
  friend class CuDNNWrapper;

 private:
  CuDNNHandles() {
    for (int dev = 0; dev < CAFFE2_COMPILE_TIME_MAX_GPUS; ++dev) {
      cudnn_handle_[dev] = nullptr;
    }
  }

  ~CuDNNHandles() noexcept {
    // Only destroy handles that were actually created.
    for (int dev = 0; dev < CAFFE2_COMPILE_TIME_MAX_GPUS; ++dev) {
      if (cudnn_handle_[dev] != nullptr) {
        CUDNN_CHECK(cudnnDestroy(cudnn_handle_[dev]));
      }
    }
  }

  // One lazily-created handle slot per possible GPU; name is part of the
  // friend contract with CuDNNWrapper, so it must not change.
  cudnnHandle_t cudnn_handle_[CAFFE2_COMPILE_TIME_MAX_GPUS];
};
/**
 * CuDNNWorkspace owns the raw device pointer used as cudnn scratch space.
 *
 * Meant to be used only by CuDNNWrapper as a program-wide scratch buffer:
 * cudnn calls are typically short and efficient, so running several of them
 * concurrently is rarely worthwhile, and one workspace per device suffices.
 */
struct CuDNNWorkspace {
  ~CuDNNWorkspace() noexcept {}

  // Returns a buffer of at least `nbytes` bytes. The allocation only ever
  // grows: a request larger than the current buffer frees it and allocates
  // a bigger one; smaller requests reuse the existing buffer.
  void* get(size_t nbytes) {
    if (nbytes > nbytes_) {
      reset();
      auto allocated = CUDAContext::New(nbytes);
      data_ = {allocated.first, allocated.second};
      nbytes_ = nbytes;
    }
    CAFFE_ENFORCE_GE(nbytes_, nbytes);
    return data_.get();
  }

  // Drops the current buffer (the deleter frees the device memory).
  void reset() {
    data_ = nullptr;
    nbytes_ = 0;
  }

 private:
  std::unique_ptr<void, MemoryDeleter> data_{nullptr, NoDelete};
  size_t nbytes_{0};
};
// CuDNNState is the owner of the CuDNNWorkspace, and serializes all
// executions of operations that use the state onto its own stream
// (so multiple Net workers can reuse the same workspace from
// different threads and CUDA streams).
class CuDNNState {
public:
// Creates a dedicated cudnn handle, a private stream, and the before/after
// events used to fence work against callers' streams. All resources are
// created on `gpu_id` under a DeviceGuard.
explicit CuDNNState(size_t gpu_id) : gpu_id_(gpu_id) {
DeviceGuard g(gpu_id_);
CUDNN_ENFORCE(cudnnCreate(&cudnn_handle_));
CUDA_ENFORCE(cudaEventCreate(&before_));
CUDA_ENFORCE(cudaEventCreate(&after_));
CUDA_ENFORCE(cudaStreamCreate(&stream_));
// Bind the handle to the private stream so every cudnn call issued
// through this state runs on stream_.
CUDNN_ENFORCE(cudnnSetStream(cudnn_handle_, stream_));
}
// Destroys resources in reverse-ish order of creation, on the same GPU.
~CuDNNState() noexcept {
DeviceGuard g(gpu_id_);
CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
CUDA_CHECK(cudaStreamDestroy(stream_));
CUDA_CHECK(cudaEventDestroy(after_));
CUDA_CHECK(cudaEventDestroy(before_));
}
// The handle bound to this state's private stream.
cudnnHandle_t& cudnn_handle() {
return cudnn_handle_;
}
// The scratch space owned by this state.
CuDNNWorkspace& workspace() {
return workspace_;
}
// Runs f(this) on this state's private stream, fenced so that it observes
// all prior work on the caller's `stream` and that `stream` subsequently
// waits for f's work: record before_ on `stream` -> stream_ waits on it ->
// f enqueues onto stream_ -> record after_ on stream_ -> `stream` waits.
// NOTE(review): callers must serialize access (CuDNNWrapper holds a mutex)
// so two threads cannot race through the event record/wait pairs.
template <typename F>
void execute(cudaStream_t stream, F&& f) {
CUDA_ENFORCE(cudaEventRecord(before_, stream));
CUDA_ENFORCE(cudaStreamWaitEvent(stream_, before_, 0));
f(this);
CUDA_ENFORCE(cudaEventRecord(after_, stream_));
CUDA_ENFORCE(cudaStreamWaitEvent(stream, after_, 0));
}
private:
cudnnHandle_t cudnn_handle_{nullptr};
cudaEvent_t before_{nullptr};
cudaEvent_t after_{nullptr};
// Private stream onto which all of this state's cudnn work is serialized.
cudaStream_t stream_{nullptr};
CuDNNWorkspace workspace_;
size_t gpu_id_{0};
DISABLE_COPY_AND_ASSIGN(CuDNNState);
};
/**
 * CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
 *
 * The wrapper ensures that for each thread and each gpu, there is one
 * identical cudnn handle, which is also associated with the thread-local
 * per-device cuda stream. The wrapper also hosts the device-specific cudnn
 * workspace (scratch space for some cudnn functions).
 */
class CuDNNWrapper {
public:
/**
 * Creates a cudnn wrapper associated with a CUDAContext object. Note that
 * the CUDAContext object should outlive the CuDNNWrapper.
 */
explicit CuDNNWrapper(CUDAContext* context) : context_(context) {}
/**
 * Returns the inline cudnn handle that executes on the current
 * thread's cuda_stream. The handle is created lazily, once per thread per
 * GPU (stored in tls_cudnn_handles_), and is re-bound to the context's
 * current stream on every call.
 */
cudnnHandle_t& inline_cudnn_handle() {
int gpu_id = context_->cuda_gpu_id();
auto& cudnn_handle_ = tls_cudnn_handles_.cudnn_handle_[gpu_id];
if (!cudnn_handle_) {
// First use on this thread/GPU: make the device current, then create.
context_->SwitchToDevice();
CUDNN_ENFORCE(cudnnCreate(&cudnn_handle_));
}
CUDNN_ENFORCE(cudnnSetStream(cudnn_handle_, context_->cuda_stream()));
return cudnn_handle_;
}
// Executes the closure F on the CuDNNState associated with state_idx.
// Each state is lazily constructed and guarded by its own mutex, so
// multiple net workers may share the per-GPU states safely.
template <typename F>
void with_cudnn_state(size_t state_idx, F&& f) {
CAFFE_ENFORCE(
state_idx < CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES, "Invalid state_idx");
auto& sync_state = cudnn_states()[context_->cuda_gpu_id()][state_idx];
DeviceGuard dg(context_->cuda_gpu_id());
// We need to serialize execution on the CuDNNState as we can't
// allow multiple threads to race through the cudaEventRecord
// calls (so a worker thread might wait on another worker thread's
// execution)
std::lock_guard<std::mutex> g(sync_state.mutex);
if (!sync_state.state.get()) {
sync_state.state.reset(new CuDNNState(context_->cuda_gpu_id()));
}
CHECK_NOTNULL(sync_state.state.get())->execute(context_->cuda_stream(), f);
}
protected:
// Pointer to an external cuda context that the cudnn wrapper will use.
CUDAContext* context_;
// Per-thread, per-GPU lazily-created inline handles (see CuDNNHandles).
static thread_local CuDNNHandles tls_cudnn_handles_;
// Number of shared CuDNNState slots per GPU.
static constexpr size_t CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES = 4;
// A CuDNNState together with the mutex that serializes access to it.
struct SyncedCuDNNState {
std::mutex mutex;
std::unique_ptr<CuDNNState> state;
};
using PerGPUCuDNNStates = std::array<
std::array<SyncedCuDNNState, CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES>,
CAFFE2_COMPILE_TIME_MAX_GPUS>;
// Process-wide state table; defined in the .cc (never deleted, to avoid
// racing the CUDA shutdown sequence).
static PerGPUCuDNNStates& cudnn_states();
DISABLE_COPY_AND_ASSIGN(CuDNNWrapper);
};
} // namespace caffe2

View file

@ -20,12 +20,13 @@
#include <ctime>
#include <mutex>
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/logging.h"
namespace caffe2 {
@ -52,16 +53,18 @@ CudaMemoryPoolType GetCudaMemoryPoolType();
*/
class ThreadLocalCUDAObjects {
friend class CUDAContext;
private:
ThreadLocalCUDAObjects() {
for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
cuda_streams_[i] = vector<cudaStream_t>();
cublas_handles_[i] = vector<cublasHandle_t>();
cudnn_handles_[i] = vector<cudnnHandle_t>();
}
}
cudaStream_t GetStream(int gpu, int stream_id) {
vector<cudaStream_t> &gpu_streams = cuda_streams_[gpu];
vector<cudaStream_t>& gpu_streams = cuda_streams_[gpu];
if (gpu_streams.size() <= stream_id) {
gpu_streams.resize(stream_id + 1, nullptr);
}
@ -75,7 +78,7 @@ class ThreadLocalCUDAObjects {
cublasHandle_t GetHandle(int gpu, int stream_id) {
DeviceGuard guard(gpu);
vector<cublasHandle_t> &gpu_handles = cublas_handles_[gpu];
vector<cublasHandle_t>& gpu_handles = cublas_handles_[gpu];
if (gpu_handles.size() <= stream_id) {
gpu_handles.resize(stream_id + 1, nullptr);
}
@ -92,6 +95,20 @@ class ThreadLocalCUDAObjects {
return gpu_handles[stream_id];
}
cudnnHandle_t GetCudnnHandle(int gpu, int stream_id) {
DeviceGuard guard(gpu);
vector<cudnnHandle_t>& gpu_handles = cudnn_handles_[gpu];
if (gpu_handles.size() <= stream_id) {
gpu_handles.resize(stream_id + 1, nullptr);
}
if (!gpu_handles[stream_id]) {
CUDNN_ENFORCE(cudnnCreate(&gpu_handles[stream_id]));
CUDNN_ENFORCE(
cudnnSetStream(gpu_handles[stream_id], GetStream(gpu, stream_id)));
}
return gpu_handles[stream_id];
}
~ThreadLocalCUDAObjects() noexcept {
for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
for (auto& handle : cublas_handles_[i]) {
@ -104,10 +121,16 @@ class ThreadLocalCUDAObjects {
CUDA_CHECK(cudaStreamDestroy(stream));
}
}
for (auto& handle : cudnn_handles_[i]) {
if (handle) {
CUDNN_CHECK(cudnnDestroy(handle));
}
}
}
}
vector<cudaStream_t> cuda_streams_[CAFFE2_COMPILE_TIME_MAX_GPUS];
vector<cublasHandle_t> cublas_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
vector<cudnnHandle_t> cudnn_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
};
class CUDAContext final {
@ -166,6 +189,10 @@ class CUDAContext final {
return cuda_objects_.GetHandle(gpu_id_, stream_id_);
}
cudnnHandle_t cudnn_handle() {
return cuda_objects_.GetCudnnHandle(gpu_id_, stream_id_);
}
curandGenerator_t& curand_generator() {
if (!curand_generator_) {
DeviceGuard guard(gpu_id_);

View file

@ -0,0 +1,161 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#ifndef CAFFE2_CORE_CUDNN_WRAPPERS_H_
#define CAFFE2_CORE_CUDNN_WRAPPERS_H_
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
class CuDNNWrapper;
/**
 * CuDNNWorkspace holds the raw device pointer used as cudnn scratch space.
 *
 * Intended solely for CuDNNWrapper, as a program-wide scratch buffer:
 * cudnn calls are usually efficient enough that running several at once is
 * not worthwhile, so a single workspace per device is sufficient.
 */
struct CuDNNWorkspace {
  ~CuDNNWorkspace() noexcept {}

  // Hands back a buffer of at least `nbytes` bytes, reallocating (and only
  // ever growing) when the current buffer is too small.
  void* get(size_t nbytes) {
    const bool needs_grow = nbytes_ < nbytes;
    if (needs_grow) {
      reset();
      auto ptr_and_deleter = CUDAContext::New(nbytes);
      data_ = {ptr_and_deleter.first, ptr_and_deleter.second};
      nbytes_ = nbytes;
    }
    CAFFE_ENFORCE_GE(nbytes_, nbytes);
    return data_.get();
  }

  // Frees the current buffer via its deleter and records zero capacity.
  void reset() {
    data_ = nullptr;
    nbytes_ = 0;
  }

 private:
  std::unique_ptr<void, MemoryDeleter> data_{nullptr, NoDelete};
  size_t nbytes_{0};
};
// CuDNNState is the owner of the CuDNNWorkspace, and serializes all
// executions of operations that use the state onto its own stream
// (so multiple Net workers can reuse the same workspace from
// different threads and CUDA streams).
class CuDNNState {
public:
// Creates, on `gpu_id` under a DeviceGuard: a dedicated cudnn handle, a
// private stream, and the before/after events used for cross-stream
// fencing in execute().
explicit CuDNNState(size_t gpu_id) : gpu_id_(gpu_id) {
DeviceGuard g(gpu_id_);
CUDNN_ENFORCE(cudnnCreate(&cudnn_handle_));
CUDA_ENFORCE(cudaEventCreate(&before_));
CUDA_ENFORCE(cudaEventCreate(&after_));
CUDA_ENFORCE(cudaStreamCreate(&stream_));
// All cudnn work issued through this state runs on the private stream.
CUDNN_ENFORCE(cudnnSetStream(cudnn_handle_, stream_));
}
// Tears down the handle, stream, and events on the owning GPU.
~CuDNNState() noexcept {
DeviceGuard g(gpu_id_);
CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
CUDA_CHECK(cudaStreamDestroy(stream_));
CUDA_CHECK(cudaEventDestroy(after_));
CUDA_CHECK(cudaEventDestroy(before_));
}
// Handle bound to this state's private stream.
cudnnHandle_t& cudnn_handle() {
return cudnn_handle_;
}
// Scratch space owned by this state.
CuDNNWorkspace& workspace() {
return workspace_;
}
// Runs f(this) on the private stream, ordered after all prior work on the
// caller's `stream`, and makes `stream` wait for f's work afterwards:
// before_ recorded on `stream` -> stream_ waits -> f enqueues on stream_
// -> after_ recorded on stream_ -> `stream` waits on after_.
// NOTE(review): callers must hold external serialization (CuDNNWrapper's
// per-state mutex) so concurrent threads don't race the event pairs.
template <typename F>
void execute(cudaStream_t stream, F&& f) {
CUDA_ENFORCE(cudaEventRecord(before_, stream));
CUDA_ENFORCE(cudaStreamWaitEvent(stream_, before_, 0));
f(this);
CUDA_ENFORCE(cudaEventRecord(after_, stream_));
CUDA_ENFORCE(cudaStreamWaitEvent(stream, after_, 0));
}
private:
cudnnHandle_t cudnn_handle_{nullptr};
cudaEvent_t before_{nullptr};
cudaEvent_t after_{nullptr};
// Private stream onto which this state's cudnn work is serialized.
cudaStream_t stream_{nullptr};
CuDNNWorkspace workspace_;
size_t gpu_id_{0};
DISABLE_COPY_AND_ASSIGN(CuDNNState);
};
/**
 * CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
 *
 * The wrapper ensures that for each thread and each gpu, there is one
 * identical cudnn handle, which is also associated with the thread-local
 * per-device cuda stream. The wrapper also hosts the device-specific cudnn
 * workspace (scratch space for some cudnn functions).
 */
class CuDNNWrapper {
 public:
  /**
   * Creates a cudnn wrapper associated with a CUDAContext object. Note that
   * the CUDAContext object should outlive the CuDNNWrapper.
   */
  explicit CuDNNWrapper(CUDAContext* context) : context_(context) {}

  /**
   * Returns the inline cudnn handle that executes on the current
   * thread's cuda_stream. Delegates to CUDAContext::cudnn_handle(), which
   * owns handle creation and stream binding.
   */
  cudnnHandle_t inline_cudnn_handle() {
    return context_->cudnn_handle();
  }

  // Executes the closure F on the CuDNNState associated with state_idx.
  // States are lazily constructed, one table per GPU; each state is guarded
  // by its own mutex so multiple net workers can share it safely.
  template <typename F>
  void with_cudnn_state(size_t state_idx, F&& f) {
    CAFFE_ENFORCE(
        state_idx < CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES, "Invalid state_idx");
    auto& sync_state = cudnn_states()[context_->cuda_gpu_id()][state_idx];

    DeviceGuard dg(context_->cuda_gpu_id());

    // We need to serialize execution on the CuDNNState as we can't
    // allow multiple threads to race through the cudaEventRecord
    // calls (so a worker thread might wait on another worker thread's
    // execution)
    std::lock_guard<std::mutex> g(sync_state.mutex);
    if (!sync_state.state.get()) {
      sync_state.state.reset(new CuDNNState(context_->cuda_gpu_id()));
    }
    CHECK_NOTNULL(sync_state.state.get())->execute(context_->cuda_stream(), f);
  }

 protected:
  // Pointer to an external cuda context that the cudnn wrapper will use.
  CUDAContext* context_;

  // Number of shared CuDNNState slots per GPU.
  static constexpr size_t CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES = 4;

  // A CuDNNState together with the mutex that serializes access to it.
  struct SyncedCuDNNState {
    std::mutex mutex;
    std::unique_ptr<CuDNNState> state;
  };

  using PerGPUCuDNNStates = std::array<
      std::array<SyncedCuDNNState, CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES>,
      CAFFE2_COMPILE_TIME_MAX_GPUS>;
  // Process-wide state table; defined in the .cc file.
  static PerGPUCuDNNStates& cudnn_states();

  DISABLE_COPY_AND_ASSIGN(CuDNNWrapper);
};

} // namespace caffe2

#endif // CAFFE2_CORE_CUDNN_WRAPPERS_H_

View file

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_op_cache_cudnn.h"
#include "caffe2/operators/conv_pool_op_base.h"

View file

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/operators/conv_op_cache_cudnn.h"
#include "caffe2/operators/conv_transpose_op.h"

View file

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"

View file

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"

View file

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include <cub/cub.cuh>

View file

@ -17,9 +17,9 @@
#ifndef CAFFE2_OPERATORS_RECURRENT_OP_CUDNN_H_
#define CAFFE2_OPERATORS_RECURRENT_OP_CUDNN_H_
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"

View file

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"

View file

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/types.h"
#include "caffe2/operators/softmax_op.h"

View file

@ -16,8 +16,8 @@
#include <cfloat>
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/operators/spatial_batch_norm_op.h"
#include "caffe2/utils/math.h"

View file

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/types.h"
#include "caffe2/operators/transpose_op.h"
#include "caffe2/operators/transpose_op_gpu.h"