mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
Add cuDNN handles to CUDAContext
Summary: Add CUDAContext::cudnn_handle() for easier integration of single cudnn routines into operators without requiring the weight of CuDNNWrapper or similar, or needing to spin out a separate CuDNN*Op version of an operator. It was necessary to split out the cuDNN wrapper code from the base cuDNN helpers in order to resolve a circular dependency between context_gpu.h and common_cudnn.h when handles and cuDNN `#define` were added. Closes https://github.com/caffe2/caffe2/pull/1376 Reviewed By: pietern Differential Revision: D6162034 Pulled By: akyrola fbshipit-source-id: 95687e55b3e1e921e1f5e0f016f43b586f5f3350
This commit is contained in:
parent
86dc6e0837
commit
f3078dec64
14 changed files with 202 additions and 196 deletions
|
|
@ -15,13 +15,12 @@
|
|||
*/
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/cudnn_wrappers.h"
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
thread_local CuDNNHandles CuDNNWrapper::tls_cudnn_handles_;
|
||||
|
||||
CuDNNWrapper::PerGPUCuDNNStates& CuDNNWrapper::cudnn_states() {
|
||||
// New it (never delete) to avoid calling the destructors on process
|
||||
// exit and racing against the CUDA shutdown sequence.
|
||||
|
|
|
|||
|
|
@ -23,9 +23,7 @@
|
|||
#include <cudnn.h>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
|
@ -316,185 +314,6 @@ class cudnnFilterDescWrapper {
|
|||
DISABLE_COPY_AND_ASSIGN(cudnnFilterDescWrapper);
|
||||
};
|
||||
|
||||
class CuDNNWrapper;
|
||||
/**
|
||||
* CuDNNHandles wraps around cudnnHandle_t so they can be
|
||||
* properly destructed when threads exit.
|
||||
*/
|
||||
class CuDNNHandles {
|
||||
friend class CuDNNWrapper;
|
||||
|
||||
private:
|
||||
CuDNNHandles() {
|
||||
for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
|
||||
cudnn_handle_[i] = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
~CuDNNHandles() noexcept {
|
||||
for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
|
||||
if (cudnn_handle_[i]) {
|
||||
CUDNN_CHECK(cudnnDestroy(cudnn_handle_[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cudnnHandle_t cudnn_handle_[CAFFE2_COMPILE_TIME_MAX_GPUS];
|
||||
};
|
||||
|
||||
/**
|
||||
* CuDNNWorkspace is a wrapper around a raw cuda pointer that holds the cudnn
|
||||
* scratch space. This struct is meant to be only used in CuDNNWrapper to
|
||||
* provide a program-wide scratch space for CuDNN. The reason behind it is that
|
||||
* cudnn function calls are usually very efficient, hence one probably does not
|
||||
* want to run multiple cudnn calls at the same time. As a result, one should
|
||||
* not need more than one cudnn workspace per device.
|
||||
*/
|
||||
struct CuDNNWorkspace {
|
||||
~CuDNNWorkspace() noexcept {}
|
||||
|
||||
void* get(size_t nbytes) {
|
||||
if (nbytes_ < nbytes) {
|
||||
reset();
|
||||
auto data_and_deleter = CUDAContext::New(nbytes);
|
||||
data_ = {data_and_deleter.first, data_and_deleter.second};
|
||||
nbytes_ = nbytes;
|
||||
}
|
||||
CAFFE_ENFORCE_GE(nbytes_, nbytes);
|
||||
return data_.get();
|
||||
}
|
||||
|
||||
void reset() {
|
||||
data_ = nullptr;
|
||||
nbytes_ = 0;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<void, MemoryDeleter> data_{nullptr, NoDelete};
|
||||
size_t nbytes_{0};
|
||||
};
|
||||
|
||||
// CuDNNState is the owner of the CuDNNWorkspace, and serializes all
|
||||
// executions of operations that use the state onto it's own stream
|
||||
// (so multiple Net workers can reuse the same workspace from
|
||||
// different threads and CUDA streams).
|
||||
class CuDNNState {
|
||||
public:
|
||||
explicit CuDNNState(size_t gpu_id) : gpu_id_(gpu_id) {
|
||||
DeviceGuard g(gpu_id_);
|
||||
CUDNN_ENFORCE(cudnnCreate(&cudnn_handle_));
|
||||
CUDA_ENFORCE(cudaEventCreate(&before_));
|
||||
CUDA_ENFORCE(cudaEventCreate(&after_));
|
||||
CUDA_ENFORCE(cudaStreamCreate(&stream_));
|
||||
CUDNN_ENFORCE(cudnnSetStream(cudnn_handle_, stream_));
|
||||
}
|
||||
|
||||
~CuDNNState() noexcept {
|
||||
DeviceGuard g(gpu_id_);
|
||||
CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
|
||||
CUDA_CHECK(cudaStreamDestroy(stream_));
|
||||
CUDA_CHECK(cudaEventDestroy(after_));
|
||||
CUDA_CHECK(cudaEventDestroy(before_));
|
||||
}
|
||||
|
||||
cudnnHandle_t& cudnn_handle() {
|
||||
return cudnn_handle_;
|
||||
}
|
||||
|
||||
CuDNNWorkspace& workspace() {
|
||||
return workspace_;
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void execute(cudaStream_t stream, F&& f) {
|
||||
CUDA_ENFORCE(cudaEventRecord(before_, stream));
|
||||
CUDA_ENFORCE(cudaStreamWaitEvent(stream_, before_, 0));
|
||||
f(this);
|
||||
CUDA_ENFORCE(cudaEventRecord(after_, stream_));
|
||||
CUDA_ENFORCE(cudaStreamWaitEvent(stream, after_, 0));
|
||||
}
|
||||
|
||||
private:
|
||||
cudnnHandle_t cudnn_handle_{nullptr};
|
||||
cudaEvent_t before_{nullptr};
|
||||
cudaEvent_t after_{nullptr};
|
||||
cudaStream_t stream_{nullptr};
|
||||
CuDNNWorkspace workspace_;
|
||||
size_t gpu_id_{0};
|
||||
DISABLE_COPY_AND_ASSIGN(CuDNNState);
|
||||
};
|
||||
|
||||
/**
|
||||
* CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
|
||||
*
|
||||
* The wrapper ensures that for each thread and each gpu, there is one
|
||||
* identical cudnn handle, which is also associated with the thread-local
|
||||
* per-device cuda stream. The wrapper also hosts the device-specific cudnn
|
||||
* workspace (scratch space for some cudnn functions).
|
||||
*
|
||||
*/
|
||||
class CuDNNWrapper {
|
||||
public:
|
||||
/**
|
||||
* Creates a cudnn wrapper associated with a CUDAContext object. Note that
|
||||
* the CUDAContext object should outlive the CuDNNWrapper.
|
||||
*/
|
||||
explicit CuDNNWrapper(CUDAContext* context) : context_(context) {}
|
||||
|
||||
/**
|
||||
* Returns the inline cudnn handle that executes on the current
|
||||
* thread's cuda_stream.
|
||||
*/
|
||||
cudnnHandle_t& inline_cudnn_handle() {
|
||||
int gpu_id = context_->cuda_gpu_id();
|
||||
auto& cudnn_handle_ = tls_cudnn_handles_.cudnn_handle_[gpu_id];
|
||||
if (!cudnn_handle_) {
|
||||
context_->SwitchToDevice();
|
||||
CUDNN_ENFORCE(cudnnCreate(&cudnn_handle_));
|
||||
}
|
||||
CUDNN_ENFORCE(cudnnSetStream(cudnn_handle_, context_->cuda_stream()));
|
||||
return cudnn_handle_;
|
||||
}
|
||||
|
||||
// Executes the closure F on the CuDNNState associated with state_idx
|
||||
template <typename F>
|
||||
void with_cudnn_state(size_t state_idx, F&& f) {
|
||||
CAFFE_ENFORCE(
|
||||
state_idx < CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES, "Invalid state_idx");
|
||||
auto& sync_state = cudnn_states()[context_->cuda_gpu_id()][state_idx];
|
||||
|
||||
DeviceGuard dg(context_->cuda_gpu_id());
|
||||
|
||||
// We need to serialize execution on the CuDNNState as we can't
|
||||
// allow multiple threads to race through the cudaEventRecord
|
||||
// calls (so a worker thread might wait on another worker thread's
|
||||
// execution)
|
||||
std::lock_guard<std::mutex> g(sync_state.mutex);
|
||||
if (!sync_state.state.get()) {
|
||||
sync_state.state.reset(new CuDNNState(context_->cuda_gpu_id()));
|
||||
}
|
||||
CHECK_NOTNULL(sync_state.state.get())->execute(context_->cuda_stream(), f);
|
||||
}
|
||||
|
||||
protected:
|
||||
// Pointer to an external cuda context that the cudnn wrapper will use.
|
||||
CUDAContext* context_;
|
||||
static thread_local CuDNNHandles tls_cudnn_handles_;
|
||||
|
||||
static constexpr size_t CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES = 4;
|
||||
|
||||
struct SyncedCuDNNState {
|
||||
std::mutex mutex;
|
||||
std::unique_ptr<CuDNNState> state;
|
||||
};
|
||||
|
||||
using PerGPUCuDNNStates = std::array<
|
||||
std::array<SyncedCuDNNState, CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES>,
|
||||
CAFFE2_COMPILE_TIME_MAX_GPUS>;
|
||||
static PerGPUCuDNNStates& cudnn_states();
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(CuDNNWrapper);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
|
|
|||
|
|
@ -20,12 +20,13 @@
|
|||
#include <ctime>
|
||||
#include <mutex>
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
|
|
@ -52,16 +53,18 @@ CudaMemoryPoolType GetCudaMemoryPoolType();
|
|||
*/
|
||||
class ThreadLocalCUDAObjects {
|
||||
friend class CUDAContext;
|
||||
|
||||
private:
|
||||
ThreadLocalCUDAObjects() {
|
||||
for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
|
||||
cuda_streams_[i] = vector<cudaStream_t>();
|
||||
cublas_handles_[i] = vector<cublasHandle_t>();
|
||||
cudnn_handles_[i] = vector<cudnnHandle_t>();
|
||||
}
|
||||
}
|
||||
|
||||
cudaStream_t GetStream(int gpu, int stream_id) {
|
||||
vector<cudaStream_t> &gpu_streams = cuda_streams_[gpu];
|
||||
vector<cudaStream_t>& gpu_streams = cuda_streams_[gpu];
|
||||
if (gpu_streams.size() <= stream_id) {
|
||||
gpu_streams.resize(stream_id + 1, nullptr);
|
||||
}
|
||||
|
|
@ -75,7 +78,7 @@ class ThreadLocalCUDAObjects {
|
|||
|
||||
cublasHandle_t GetHandle(int gpu, int stream_id) {
|
||||
DeviceGuard guard(gpu);
|
||||
vector<cublasHandle_t> &gpu_handles = cublas_handles_[gpu];
|
||||
vector<cublasHandle_t>& gpu_handles = cublas_handles_[gpu];
|
||||
if (gpu_handles.size() <= stream_id) {
|
||||
gpu_handles.resize(stream_id + 1, nullptr);
|
||||
}
|
||||
|
|
@ -92,6 +95,20 @@ class ThreadLocalCUDAObjects {
|
|||
return gpu_handles[stream_id];
|
||||
}
|
||||
|
||||
cudnnHandle_t GetCudnnHandle(int gpu, int stream_id) {
|
||||
DeviceGuard guard(gpu);
|
||||
vector<cudnnHandle_t>& gpu_handles = cudnn_handles_[gpu];
|
||||
if (gpu_handles.size() <= stream_id) {
|
||||
gpu_handles.resize(stream_id + 1, nullptr);
|
||||
}
|
||||
if (!gpu_handles[stream_id]) {
|
||||
CUDNN_ENFORCE(cudnnCreate(&gpu_handles[stream_id]));
|
||||
CUDNN_ENFORCE(
|
||||
cudnnSetStream(gpu_handles[stream_id], GetStream(gpu, stream_id)));
|
||||
}
|
||||
return gpu_handles[stream_id];
|
||||
}
|
||||
|
||||
~ThreadLocalCUDAObjects() noexcept {
|
||||
for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
|
||||
for (auto& handle : cublas_handles_[i]) {
|
||||
|
|
@ -104,10 +121,16 @@ class ThreadLocalCUDAObjects {
|
|||
CUDA_CHECK(cudaStreamDestroy(stream));
|
||||
}
|
||||
}
|
||||
for (auto& handle : cudnn_handles_[i]) {
|
||||
if (handle) {
|
||||
CUDNN_CHECK(cudnnDestroy(handle));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
vector<cudaStream_t> cuda_streams_[CAFFE2_COMPILE_TIME_MAX_GPUS];
|
||||
vector<cublasHandle_t> cublas_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
|
||||
vector<cudnnHandle_t> cudnn_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
|
||||
};
|
||||
|
||||
class CUDAContext final {
|
||||
|
|
@ -166,6 +189,10 @@ class CUDAContext final {
|
|||
return cuda_objects_.GetHandle(gpu_id_, stream_id_);
|
||||
}
|
||||
|
||||
cudnnHandle_t cudnn_handle() {
|
||||
return cuda_objects_.GetCudnnHandle(gpu_id_, stream_id_);
|
||||
}
|
||||
|
||||
curandGenerator_t& curand_generator() {
|
||||
if (!curand_generator_) {
|
||||
DeviceGuard guard(gpu_id_);
|
||||
|
|
|
|||
161
caffe2/core/cudnn_wrappers.h
Normal file
161
caffe2/core/cudnn_wrappers.h
Normal file
|
|
@ -0,0 +1,161 @@
|
|||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#ifndef CAFFE2_CORE_CUDNN_WRAPPERS_H_
|
||||
#define CAFFE2_CORE_CUDNN_WRAPPERS_H_
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class CuDNNWrapper;
|
||||
|
||||
/**
|
||||
* CuDNNWorkspace is a wrapper around a raw cuda pointer that holds the cudnn
|
||||
* scratch space. This struct is meant to be only used in CuDNNWrapper to
|
||||
* provide a program-wide scratch space for CuDNN. The reason behind it is that
|
||||
* cudnn function calls are usually very efficient, hence one probably does not
|
||||
* want to run multiple cudnn calls at the same time. As a result, one should
|
||||
* not need more than one cudnn workspace per device.
|
||||
*/
|
||||
struct CuDNNWorkspace {
|
||||
~CuDNNWorkspace() noexcept {}
|
||||
|
||||
void* get(size_t nbytes) {
|
||||
if (nbytes_ < nbytes) {
|
||||
reset();
|
||||
auto data_and_deleter = CUDAContext::New(nbytes);
|
||||
data_ = {data_and_deleter.first, data_and_deleter.second};
|
||||
nbytes_ = nbytes;
|
||||
}
|
||||
CAFFE_ENFORCE_GE(nbytes_, nbytes);
|
||||
return data_.get();
|
||||
}
|
||||
|
||||
void reset() {
|
||||
data_ = nullptr;
|
||||
nbytes_ = 0;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<void, MemoryDeleter> data_{nullptr, NoDelete};
|
||||
size_t nbytes_{0};
|
||||
};
|
||||
|
||||
// CuDNNState is the owner of the CuDNNWorkspace, and serializes all
|
||||
// executions of operations that use the state onto it's own stream
|
||||
// (so multiple Net workers can reuse the same workspace from
|
||||
// different threads and CUDA streams).
|
||||
class CuDNNState {
|
||||
public:
|
||||
explicit CuDNNState(size_t gpu_id) : gpu_id_(gpu_id) {
|
||||
DeviceGuard g(gpu_id_);
|
||||
CUDNN_ENFORCE(cudnnCreate(&cudnn_handle_));
|
||||
CUDA_ENFORCE(cudaEventCreate(&before_));
|
||||
CUDA_ENFORCE(cudaEventCreate(&after_));
|
||||
CUDA_ENFORCE(cudaStreamCreate(&stream_));
|
||||
CUDNN_ENFORCE(cudnnSetStream(cudnn_handle_, stream_));
|
||||
}
|
||||
|
||||
~CuDNNState() noexcept {
|
||||
DeviceGuard g(gpu_id_);
|
||||
CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
|
||||
CUDA_CHECK(cudaStreamDestroy(stream_));
|
||||
CUDA_CHECK(cudaEventDestroy(after_));
|
||||
CUDA_CHECK(cudaEventDestroy(before_));
|
||||
}
|
||||
|
||||
cudnnHandle_t& cudnn_handle() {
|
||||
return cudnn_handle_;
|
||||
}
|
||||
|
||||
CuDNNWorkspace& workspace() {
|
||||
return workspace_;
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void execute(cudaStream_t stream, F&& f) {
|
||||
CUDA_ENFORCE(cudaEventRecord(before_, stream));
|
||||
CUDA_ENFORCE(cudaStreamWaitEvent(stream_, before_, 0));
|
||||
f(this);
|
||||
CUDA_ENFORCE(cudaEventRecord(after_, stream_));
|
||||
CUDA_ENFORCE(cudaStreamWaitEvent(stream, after_, 0));
|
||||
}
|
||||
|
||||
private:
|
||||
cudnnHandle_t cudnn_handle_{nullptr};
|
||||
cudaEvent_t before_{nullptr};
|
||||
cudaEvent_t after_{nullptr};
|
||||
cudaStream_t stream_{nullptr};
|
||||
CuDNNWorkspace workspace_;
|
||||
size_t gpu_id_{0};
|
||||
DISABLE_COPY_AND_ASSIGN(CuDNNState);
|
||||
};
|
||||
|
||||
/**
|
||||
* CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
|
||||
*
|
||||
* The wrapper ensures that for each thread and each gpu, there is one
|
||||
* identical cudnn handle, which is also associated with the thread-local
|
||||
* per-device cuda stream. The wrapper also hosts the device-specific cudnn
|
||||
* workspace (scratch space for some cudnn functions).
|
||||
*
|
||||
*/
|
||||
class CuDNNWrapper {
|
||||
public:
|
||||
/**
|
||||
* Creates a cudnn wrapper associated with a CUDAContext object. Note that
|
||||
* the CUDAContext object should outlive the CuDNNWrapper.
|
||||
*/
|
||||
explicit CuDNNWrapper(CUDAContext* context) : context_(context) {}
|
||||
|
||||
/**
|
||||
* Returns the inline cudnn handle that executes on the current
|
||||
* thread's cuda_stream.
|
||||
*/
|
||||
cudnnHandle_t inline_cudnn_handle() {
|
||||
return context_->cudnn_handle();
|
||||
}
|
||||
|
||||
// Executes the closure F on the CuDNNState associated with state_idx
|
||||
template <typename F>
|
||||
void with_cudnn_state(size_t state_idx, F&& f) {
|
||||
CAFFE_ENFORCE(
|
||||
state_idx < CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES, "Invalid state_idx");
|
||||
auto& sync_state = cudnn_states()[context_->cuda_gpu_id()][state_idx];
|
||||
|
||||
DeviceGuard dg(context_->cuda_gpu_id());
|
||||
|
||||
// We need to serialize execution on the CuDNNState as we can't
|
||||
// allow multiple threads to race through the cudaEventRecord
|
||||
// calls (so a worker thread might wait on another worker thread's
|
||||
// execution)
|
||||
std::lock_guard<std::mutex> g(sync_state.mutex);
|
||||
if (!sync_state.state.get()) {
|
||||
sync_state.state.reset(new CuDNNState(context_->cuda_gpu_id()));
|
||||
}
|
||||
CHECK_NOTNULL(sync_state.state.get())->execute(context_->cuda_stream(), f);
|
||||
}
|
||||
|
||||
protected:
|
||||
// Pointer to an external cuda context that the cudnn wrapper will use.
|
||||
CUDAContext* context_;
|
||||
|
||||
static constexpr size_t CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES = 4;
|
||||
|
||||
struct SyncedCuDNNState {
|
||||
std::mutex mutex;
|
||||
std::unique_ptr<CuDNNState> state;
|
||||
};
|
||||
|
||||
using PerGPUCuDNNStates = std::array<
|
||||
std::array<SyncedCuDNNState, CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES>,
|
||||
CAFFE2_COMPILE_TIME_MAX_GPUS>;
|
||||
static PerGPUCuDNNStates& cudnn_states();
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(CuDNNWrapper);
|
||||
};
|
||||
|
||||
}; // namespace caffe2
|
||||
|
||||
#endif
|
||||
|
|
@ -14,8 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/cudnn_wrappers.h"
|
||||
#include "caffe2/operators/conv_op.h"
|
||||
#include "caffe2/operators/conv_op_cache_cudnn.h"
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/cudnn_wrappers.h"
|
||||
#include "caffe2/operators/conv_op_cache_cudnn.h"
|
||||
#include "caffe2/operators/conv_transpose_op.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/cudnn_wrappers.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/types.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/cudnn_wrappers.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/types.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/cudnn_wrappers.h"
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
|
||||
#include <cub/cub.cuh>
|
||||
|
|
|
|||
|
|
@ -17,9 +17,9 @@
|
|||
#ifndef CAFFE2_OPERATORS_RECURRENT_OP_CUDNN_H_
|
||||
#define CAFFE2_OPERATORS_RECURRENT_OP_CUDNN_H_
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/cudnn_wrappers.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/cudnn_wrappers.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/types.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/cudnn_wrappers.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/operators/softmax_op.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -16,8 +16,8 @@
|
|||
|
||||
#include <cfloat>
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/cudnn_wrappers.h"
|
||||
#include "caffe2/operators/spatial_batch_norm_op.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/cudnn_wrappers.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/operators/transpose_op.h"
|
||||
#include "caffe2/operators/transpose_op_gpu.h"
|
||||
|
|
|
|||
Loading…
Reference in a new issue