Add Algorithm Search for ConvGrad (#8613)

* algo search for conv grad

* global cache, bigger workspace size

* fix build error

* refactor

* refactor

* resolve comments

* fix rocm

* change lock places

* rename variable

* remove setting for inference

* resolve comments
This commit is contained in:
Vincent Wang 2021-09-03 11:25:17 +08:00 committed by GitHub
parent 91f05f387a
commit c343f7cb43
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
22 changed files with 898 additions and 226 deletions

View file

@ -130,14 +130,16 @@ class IAllocator {
Create a std::unique_ptr that is allocated and freed by the provided IAllocator.
@param allocator The allocator.
@param count_or_bytes The exact bytes to allocate if T is void, otherwise the number of elements to allocate.
@param use_reserve If true, call Reserve() instead of Alloc() to allocate memory.
@returns std::unique_ptr with allocated memory and deleter.
*/
template <typename T>
static IAllocatorUniquePtr<T> MakeUniquePtr(std::shared_ptr<IAllocator> allocator, size_t count_or_bytes) {
static IAllocatorUniquePtr<T> MakeUniquePtr(std::shared_ptr<IAllocator> allocator, size_t count_or_bytes,
bool use_reserve = false) {
if (allocator == nullptr) return nullptr;
// for now limit to fundamental types. we could support others, but to do so either we or the caller
// needs to call the dtor for the objects, for buffers allocated on device we don't have destructor
//static_assert(std::is_fundamental<T>::value, "Fundamental type required as no destructors are called.");
// static_assert(std::is_fundamental<T>::value, "Fundamental type required as no destructors are called.");
size_t alloc_size = count_or_bytes;
@ -145,14 +147,15 @@ class IAllocator {
if (!std::is_void<T>::value) {
// sizeof(void) isn't valid, but the compiler isn't smart enough to ignore that this line isn't
// reachable if T is void. use std::conditional to 'use' void* in the sizeof call
if (!CalcMemSizeForArray(count_or_bytes,
sizeof(typename std::conditional<std::is_void<T>::value, void*, T>::type),
&alloc_size)) return nullptr;
if (!CalcMemSizeForArray(
count_or_bytes, sizeof(typename std::conditional<std::is_void<T>::value, void*, T>::type), &alloc_size)) {
return nullptr;
}
}
return IAllocatorUniquePtr<T>{
static_cast<T*>(allocator->Alloc(alloc_size)), // allocate
[=](T* ptr) { // capture 'allocator' by value so it's always valid
static_cast<T*>(use_reserve ? allocator->Reserve(alloc_size) : allocator->Alloc(alloc_size)), // allocate
[=](T* ptr) { // capture 'allocator' by value so it's always valid
allocator->Free(ptr);
}};
}

View file

@ -77,6 +77,21 @@ void* CUDAExternalAllocator::Alloc(size_t size) {
void CUDAExternalAllocator::Free(void* p) {
free_(p);
std::lock_guard<OrtMutex> lock(lock_);
auto it = reserved_.find(p);
if (it != reserved_.end()) {
reserved_.erase(it);
if (empty_cache_) empty_cache_();
}
}
void* CUDAExternalAllocator::Reserve(size_t size) {
void* p = Alloc(size);
if (!p) return nullptr;
std::lock_guard<OrtMutex> lock(lock_);
ORT_ENFORCE(reserved_.find(p) == reserved_.end());
reserved_.insert(p);
return p;
}
FencePtr CUDAAllocator::CreateFence(const SessionState* session_state) {

View file

@ -3,7 +3,9 @@
#pragma once
#include <unordered_set>
#include "core/framework/allocator.h"
#include "core/platform/ort_mutex.h"
namespace onnxruntime {
@ -26,20 +28,26 @@ class CUDAAllocator : public IAllocator {
class CUDAExternalAllocator : public CUDAAllocator {
typedef void* (*ExternalAlloc)(size_t size);
typedef void (*ExternalFree)(void* p);
typedef void (*ExternalEmptyCache)();
public:
CUDAExternalAllocator(OrtDevice::DeviceId device_id, const char* name, void* alloc, void* free)
CUDAExternalAllocator(OrtDevice::DeviceId device_id, const char* name, void* alloc, void* free, void* empty_cache)
: CUDAAllocator(device_id, name) {
alloc_ = reinterpret_cast<ExternalAlloc>(alloc);
free_ = reinterpret_cast<ExternalFree>(free);
empty_cache_ = reinterpret_cast<ExternalEmptyCache>(empty_cache);
}
void* Alloc(size_t size) override;
void Free(void* p) override;
void* Reserve(size_t size) override;
private:
mutable OrtMutex lock_;
ExternalAlloc alloc_;
ExternalFree free_;
ExternalEmptyCache empty_cache_;
std::unordered_set<void*> reserved_;
};
//TODO: add a default constructor

View file

@ -98,7 +98,7 @@ AllocatorPtr CUDAExecutionProvider::CreateCudaAllocator(OrtDevice::DeviceId devi
if (external_allocator_info.UseExternalAllocator()) {
AllocatorCreationInfo default_memory_info(
[external_allocator_info](OrtDevice::DeviceId id) {
return std::make_unique<CUDAExternalAllocator>(id, CUDA, external_allocator_info.alloc, external_allocator_info.free);
return std::make_unique<CUDAExternalAllocator>(id, CUDA, external_allocator_info.alloc, external_allocator_info.free, external_allocator_info.empty_cache);
},
device_id,
false);

View file

@ -63,6 +63,14 @@ class CUDAExecutionProvider : public IExecutionProvider {
return IAllocator::MakeUniquePtr<T>(GetAllocator(info_.device_id, OrtMemTypeDefault), count_or_bytes);
}
template <typename T>
IAllocatorUniquePtr<T> GetTransientScratchBuffer(size_t count_or_bytes) const {
if (count_or_bytes == 0)
return nullptr;
return IAllocator::MakeUniquePtr<T>(GetAllocator(info_.device_id, OrtMemTypeDefault), count_or_bytes, true);
}
std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
std::unique_ptr<onnxruntime::IDataTransfer> GetDataTransfer() const override;
@ -74,6 +82,7 @@ class CUDAExecutionProvider : public IExecutionProvider {
const cudaDeviceProp& GetDeviceProp() const { return device_prop_; };
int GetCudnnConvAlgo() const { return info_.cudnn_conv_algo_search; }
bool DoCopyOnDefaultStream() const { return info_.do_copy_in_default_stream; }
bool GetCudnnConvUseMaxWorkspace() const { return info_.cudnn_conv_use_max_workspace; }
ProviderOptions GetProviderOptions() const override {
return CUDAExecutionProviderInfo::ToProviderOptions(info_);

View file

@ -19,6 +19,8 @@ constexpr const char* kCudnnConvAlgoSearch = "cudnn_conv_algo_search";
constexpr const char* kDoCopyInDefaultStream = "do_copy_in_default_stream";
constexpr const char* kGpuExternalAlloc = "gpu_external_alloc";
constexpr const char* kGpuExternalFree = "gpu_external_free";
constexpr const char* kGpuExternalEmptyCache = "gpu_external_empty_cache";
constexpr const char* kCudnnConvUseMaxWorkspace = "cudnn_conv_use_max_workspace";
} // namespace provider_option_names
} // namespace cuda
@ -39,6 +41,7 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P
CUDAExecutionProviderInfo info{};
void* alloc = nullptr;
void* free = nullptr;
void* empty_cache = nullptr;
ORT_THROW_IF_ERROR(
ProviderOptionsParser{}
.AddValueParser(
@ -71,6 +74,14 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P
free = reinterpret_cast<void*>(address);
return Status::OK();
})
.AddValueParser(
cuda::provider_option_names::kGpuExternalEmptyCache,
[&empty_cache](const std::string& value_str) -> Status {
size_t address;
ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, address));
empty_cache = reinterpret_cast<void*>(address);
return Status::OK();
})
.AddAssignmentToReference(cuda::provider_option_names::kMemLimit, info.gpu_mem_limit)
.AddAssignmentToEnumReference(
cuda::provider_option_names::kArenaExtendStrategy,
@ -79,9 +90,10 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P
cuda::provider_option_names::kCudnnConvAlgoSearch,
*ort_cudnn_conv_algo_search_mapping, info.cudnn_conv_algo_search)
.AddAssignmentToReference(cuda::provider_option_names::kDoCopyInDefaultStream, info.do_copy_in_default_stream)
.AddAssignmentToReference(cuda::provider_option_names::kCudnnConvUseMaxWorkspace, info.cudnn_conv_use_max_workspace)
.Parse(options));
CUDAExecutionProviderExternalAllocatorInfo alloc_info{alloc, free};
CUDAExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache};
info.external_allocator_info = alloc_info;
return info;
}
@ -92,11 +104,13 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution
{cuda::provider_option_names::kMemLimit, MakeStringWithClassicLocale(info.gpu_mem_limit)},
{cuda::provider_option_names::kGpuExternalAlloc, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.external_allocator_info.alloc))},
{cuda::provider_option_names::kGpuExternalFree, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.external_allocator_info.free))},
{cuda::provider_option_names::kGpuExternalEmptyCache, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.external_allocator_info.empty_cache))},
{cuda::provider_option_names::kArenaExtendStrategy,
EnumToName(*arena_extend_strategy_mapping, info.arena_extend_strategy)},
{cuda::provider_option_names::kCudnnConvAlgoSearch,
EnumToName(*ort_cudnn_conv_algo_search_mapping, info.cudnn_conv_algo_search)},
{cuda::provider_option_names::kDoCopyInDefaultStream, MakeStringWithClassicLocale(info.do_copy_in_default_stream)},
{cuda::provider_option_names::kCudnnConvUseMaxWorkspace, MakeStringWithClassicLocale(info.cudnn_conv_use_max_workspace)},
};
return options;

View file

@ -15,15 +15,18 @@ namespace onnxruntime {
struct CUDAExecutionProviderExternalAllocatorInfo {
void* alloc{nullptr};
void* free{nullptr};
void* empty_cache{nullptr};
CUDAExecutionProviderExternalAllocatorInfo() {
alloc = nullptr;
free = nullptr;
empty_cache = nullptr;
}
CUDAExecutionProviderExternalAllocatorInfo(void* a, void* f) {
CUDAExecutionProviderExternalAllocatorInfo(void* a, void* f, void* e) {
alloc = a;
free = f;
empty_cache = e;
}
bool UseExternalAllocator() const {
@ -45,6 +48,9 @@ struct CUDAExecutionProviderInfo {
// arena config.
OrtArenaCfg* default_memory_arena_cfg{nullptr};
CUDAExecutionProviderExternalAllocatorInfo external_allocator_info{};
// By default use fix workspace size (32M) for Conv algo search, the final algo might not be the best.
// If set to true, try to use as much as possible memory for algo search.
bool cudnn_conv_use_max_workspace{false};
static CUDAExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
static ProviderOptions ToProviderOptions(const CUDAExecutionProviderInfo& info);

View file

@ -52,6 +52,15 @@ class CudaKernel : public OpKernel {
return provider_->GetScratchBuffer<T>(count_or_bytes);
}
// Different from GetScratchBuffer which use IAllocator::Alloc() to allocate memory,
// this GetTransientScratchBuffer will call IAllocator::Reserve() to allocate memory.
// IAllocator::Reserve() optionally implement some allocation logic that by-passes any arena-based
// logic (or similar for different allocator) that may be housed in the Alloc() implementation.
template <typename T>
inline IAllocatorUniquePtr<T> GetTransientScratchBuffer(size_t count_or_bytes) const {
return provider_->GetTransientScratchBuffer<T>(count_or_bytes);
}
inline void AddDeferredReleaseCPUPtr(void* p) const {
provider_->AddDeferredReleaseCPUPtr(p);
}

View file

@ -33,6 +33,41 @@ REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(double)
REGISTER_KERNEL_TYPED(MLFloat16)
template <typename T>
const cudnnConvolutionFwdAlgo_t Conv<T>::kAllAlgos[] = {
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_FFT,
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,
};
cudnnStatus_t GetWorkspaceSize(const CudnnConvState<cudnnConvolutionFwdAlgoPerf_t>& s, cudnnConvolutionFwdAlgo_t algo,
size_t* sz) {
return cudnnGetConvolutionForwardWorkspaceSize(s.handle, s.x_tensor, s.w_desc, s.conv_desc, s.y_tensor, algo, sz);
}
size_t GetMaxWorkspaceSize(const CudnnConvState<cudnnConvolutionFwdAlgoPerf_t>& s,
const cudnnConvolutionFwdAlgo_t* algo, int n_algo) {
// TODO: get maximum available size from memory areana
size_t free, total;
CUDA_CALL_THROW(cudaMemGetInfo(&free, &total));
// Assuming 10% of fragmentation
free = static_cast<size_t>(static_cast<double>(free) * 0.9);
size_t max_ws_size = 0;
for (int i = 0; i < n_algo; i++) {
cudnnStatus_t err;
size_t sz;
err = GetWorkspaceSize(s, algo[i], &sz);
if (CUDNN_STATUS_SUCCESS != err || sz == 0 || sz < max_ws_size || sz > free) continue;
max_ws_size = sz;
}
return max_ws_size;
}
Status SliceOutUnwantedOutputSection(cudaStream_t stream,
const void* input_data, const std::vector<int64_t>& input_dims,
void* output_data,
@ -213,9 +248,14 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
ORT_ENFORCE(cudnn_conv_algo > -1 && cudnn_conv_algo < 3, "cudnn_conv_algo should be 0, 1 or 2, but got ", cudnn_conv_algo);
switch (cudnn_conv_algo) {
case 0: {
IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize);
static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
size_t max_ws_size = cuda_ep->GetCudnnConvUseMaxWorkspace() ? GetMaxWorkspaceSize(s_, kAllAlgos, num_algos)
: AlgoSearchWorkspaceSize;
// Use GetTransientScratchBuffer() so the workspace can be freed instead of cached.
// Because the benchmarking uses a huge amount of memory, e.g. a few GBs.
IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionForwardAlgorithmEx(
CudnnHandle(),
s_.handle,
s_.x_tensor,
s_.x_data,
s_.w_desc,
@ -227,12 +267,12 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
&algo_count, // returnedAlgoCount
&perf,
algo_search_workspace.get(),
AlgoSearchWorkspaceSize));
max_ws_size));
break;
}
case 1:
CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionForwardAlgorithm_v7(
CudnnHandle(),
s_.handle,
s_.x_tensor,
s_.w_desc,
s_.conv_desc,
@ -244,14 +284,7 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
default:
perf.algo = kDefaultConvAlgo;
CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionForwardWorkspaceSize(
CudnnHandle(),
s_.x_tensor,
s_.w_desc,
s_.conv_desc,
s_.y_tensor,
perf.algo,
&perf.memory));
CUDNN_RETURN_IF_ERROR(GetWorkspaceSize(s_, perf.algo, &perf.memory));
if (std::is_same<T, MLFloat16>::value) {
perf.mathType = CUDNN_TENSOR_OP_MATH;
} else {
@ -290,7 +323,7 @@ Status Conv<T>::ComputeInternal(OpKernelContext* context) const {
const auto alpha = Consts<CudaT>::One;
const auto beta = Consts<CudaT>::Zero;
IAllocatorUniquePtr<void> workspace = GetWorkSpace();
CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(CudnnHandle(),
CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(s_.handle,
&alpha,
s_.x_tensor,
s_.x_data,
@ -304,7 +337,7 @@ Status Conv<T>::ComputeInternal(OpKernelContext* context) const {
s_.y_tensor,
s_.y_data));
if (nullptr != s_.b_data) {
CUDNN_RETURN_IF_ERROR(cudnnAddTensor(CudnnHandle(), &alpha, s_.b_tensor, s_.b_data,
CUDNN_RETURN_IF_ERROR(cudnnAddTensor(s_.handle, &alpha, s_.b_tensor, s_.b_data,
&alpha, s_.y_tensor, s_.y_data));
}
// To deal with asymmetric padding, we may have over-padded on one or both sides of the spatial dimensions

View file

@ -111,6 +111,8 @@ constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
template <typename AlgoPerfType>
struct CudnnConvState {
cudnnHandle_t handle;
// if x/w dims changed, update algo and cudnnTensors
std::vector<int64_t> last_x_dims;
std::vector<int64_t> last_w_dims;
@ -173,6 +175,7 @@ class Conv : public CudaKernel {
Conv(const OpKernelInfo& info) : CudaKernel(info), conv_attrs_(info) {
auto pads_size = conv_attrs_.pads.size();
ORT_ENFORCE(pads_size % 2 == 0);
s_.handle = CudnnHandle();
}
Status ComputeInternal(OpKernelContext* context) const override;
@ -186,6 +189,7 @@ class Conv : public CudaKernel {
ConvAttributes conv_attrs_;
mutable CudnnConvState<cudnnConvolutionFwdAlgoPerf_t> s_;
constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
static const cudnnConvolutionFwdAlgo_t kAllAlgos[];
};
Status SliceOutUnwantedOutputSection(cudaStream_t stream,

View file

@ -66,6 +66,21 @@ void* ROCMExternalAllocator::Alloc(size_t size) {
void ROCMExternalAllocator::Free(void* p) {
free_(p);
std::lock_guard<OrtMutex> lock(lock_);
auto it = reserved_.find(p);
if (it != reserved_.end()) {
reserved_.erase(it);
if (empty_cache_) empty_cache_();
}
}
void* ROCMExternalAllocator::Reserve(size_t size) {
void* p = Alloc(size);
if (!p) return nullptr;
std::lock_guard<OrtMutex> lock(lock_);
ORT_ENFORCE(reserved_.find(p) == reserved_.end());
reserved_.insert(p);
return p;
}
void* ROCMPinnedAllocator::Alloc(size_t size) {

View file

@ -3,7 +3,9 @@
#pragma once
#include <unordered_set>
#include "core/framework/allocator.h"
#include "core/platform/ort_mutex.h"
namespace onnxruntime {
@ -25,20 +27,26 @@ class ROCMAllocator : public IAllocator {
class ROCMExternalAllocator : public ROCMAllocator {
typedef void* (*ExternalAlloc)(size_t size);
typedef void (*ExternalFree)(void* p);
typedef void (*ExternalEmptyCache)();
public:
ROCMExternalAllocator(OrtDevice::DeviceId device_id, const char* name, const void* alloc, const void* free)
ROCMExternalAllocator(OrtDevice::DeviceId device_id, const char* name, const void* alloc, const void* free, void* empty_cache)
: ROCMAllocator(device_id, name) {
alloc_ = reinterpret_cast<ExternalAlloc>(const_cast<void*>(alloc));
free_ = reinterpret_cast<ExternalFree>(const_cast<void*>(free));
empty_cache_ = reinterpret_cast<ExternalEmptyCache>(empty_cache);
}
void* Alloc(size_t size) override;
void Free(void* p) override;
void* Reserve(size_t size) override;
private:
mutable OrtMutex lock_;
ExternalAlloc alloc_;
ExternalFree free_;
ExternalEmptyCache empty_cache_;
std::unordered_set<void*> reserved_;
};
//TODO: add a default constructor

View file

@ -63,7 +63,7 @@ AllocatorPtr ROCMExecutionProvider::CreateRocmAllocator(OrtDevice::DeviceId devi
if (external_allocator_info.UseExternalAllocator()) {
AllocatorCreationInfo default_memory_info(
[external_allocator_info](OrtDevice::DeviceId id) {
return std::make_unique<ROCMExternalAllocator>(id, CUDA, external_allocator_info.alloc, external_allocator_info.free);
return std::make_unique<ROCMExternalAllocator>(id, CUDA, external_allocator_info.alloc, external_allocator_info.free, external_allocator_info.empty_cache);
},
device_id,
false);

View file

@ -16,6 +16,7 @@ constexpr const char* kArenaExtendStrategy = "arena_extend_strategy";
constexpr const char* kConvExhaustiveSearch = "conv_exhaustive_search";
constexpr const char* kGpuExternalAlloc = "gpu_external_alloc";
constexpr const char* kGpuExternalFree = "gpu_external_free";
constexpr const char* kGpuExternalEmptyCache = "gpu_external_empty_cache";
} // namespace provider_option_names
} // namespace rocm
@ -30,7 +31,7 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P
ROCMExecutionProviderInfo info{};
void* alloc = nullptr;
void* free = nullptr;
void* empty_cache = nullptr;
ORT_THROW_IF_ERROR(
ProviderOptionsParser{}
.AddValueParser(
@ -49,7 +50,15 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P
free = reinterpret_cast<void*>(address);
return Status::OK();
})
.AddValueParser(
.AddValueParser(
rocm::provider_option_names::kGpuExternalEmptyCache,
[&empty_cache](const std::string& value_str) -> Status {
size_t address;
ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, address));
empty_cache = reinterpret_cast<void*>(address);
return Status::OK();
})
.AddValueParser(
rocm::provider_option_names::kDeviceId,
[&info](const std::string& value_str) -> Status {
ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.device_id));
@ -70,7 +79,7 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P
arena_extend_strategy_mapping, info.arena_extend_strategy)
.Parse(options));
ROCMExecutionProviderExternalAllocatorInfo alloc_info{alloc, free};
ROCMExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache};
info.external_allocator_info = alloc_info;
return info;
@ -82,6 +91,7 @@ ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const ROCMExecution
{rocm::provider_option_names::kMemLimit, MakeStringWithClassicLocale(info.gpu_mem_limit)},
{rocm::provider_option_names::kGpuExternalAlloc, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.external_allocator_info.alloc))},
{rocm::provider_option_names::kGpuExternalFree, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.external_allocator_info.free))},
{rocm::provider_option_names::kGpuExternalEmptyCache, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.external_allocator_info.empty_cache))},
{rocm::provider_option_names::kConvExhaustiveSearch, MakeStringWithClassicLocale(info.miopen_conv_exhaustive_search)},
{rocm::provider_option_names::kArenaExtendStrategy,
EnumToName(arena_extend_strategy_mapping, info.arena_extend_strategy)},

View file

@ -14,15 +14,18 @@ namespace onnxruntime {
struct ROCMExecutionProviderExternalAllocatorInfo {
const void* alloc{nullptr};
const void* free{nullptr};
void* empty_cache{nullptr};
ROCMExecutionProviderExternalAllocatorInfo() {
alloc = nullptr;
free = nullptr;
empty_cache = nullptr;
}
ROCMExecutionProviderExternalAllocatorInfo(void* a, void* f) {
ROCMExecutionProviderExternalAllocatorInfo(void* a, void* f, void* e) {
alloc = a;
free = f;
empty_cache = e;
}
bool UseExternalAllocator() const {

View file

@ -184,10 +184,12 @@ class TestInferenceSession(unittest.TestCase):
option['gpu_external_alloc'] = '0'
option['gpu_external_free'] = '0'
option['gpu_external_empty_cache'] = '0'
sess.set_providers(['CUDAExecutionProvider'], [option])
options = sess.get_provider_options()
self.assertEqual(options['CUDAExecutionProvider']['gpu_external_alloc'], '0')
self.assertEqual(options['CUDAExecutionProvider']['gpu_external_free'], '0')
self.assertEqual(options['CUDAExecutionProvider']['gpu_external_empty_cache'], '0')
#
# Note: Tests that throw an exception leave an empty session due to how set_providers currently works,
# so run them last. Each set_providers call will attempt to re-create a session, so it's

View file

@ -164,6 +164,7 @@ class GraphExecutionManager(GraphExecutionInterface):
from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator
self._torch_alloc = torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
self._torch_free = torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()
self._torch_empty_cache = torch_gpu_allocator.gpu_caching_allocator_empty_cache_address()
def _validate_module_type(self, module):
"""Raises ORTModuleTorchModelException if the module is not a torch.nn.Module"""
@ -230,12 +231,16 @@ class GraphExecutionManager(GraphExecutionInterface):
providers = (["ROCMExecutionProvider"] if self.is_rocm_pytorch else [
"CUDAExecutionProvider"])
providers.append("CPUExecutionProvider")
provider_option_map = {"device_id": str(self._device.index)}
if not self.is_rocm_pytorch:
# Set Conv algo search mode to HEURISTIC, which is same as PyTorch's default setting.
provider_option_map["cudnn_conv_algo_search"] = "HEURISTIC"
provider_option_map["cudnn_conv_use_max_workspace"] = "1"
if self._use_external_gpu_allocator:
provider_options = [{"device_id": str(self._device.index),
"gpu_external_alloc": str(self._torch_alloc),
"gpu_external_free": str(self._torch_free)}, {}]
else:
provider_options = [{"device_id": str(self._device.index)}, {}]
provider_option_map["gpu_external_alloc"] = str(self._torch_alloc)
provider_option_map["gpu_external_free"] = str(self._torch_free)
provider_option_map["gpu_external_empty_cache"] = str(self._torch_empty_cache)
provider_options = [provider_option_map, {}]
elif self._device.type == 'cpu':
providers = ["CPUExecutionProvider"]
provider_options = [{}]

View file

@ -5,14 +5,22 @@
#include <c10/___gpu_identifier___/___gpu_allocator_header___.h>
size_t gpu_caching_allocator_raw_alloc_address() {
return reinterpret_cast<size_t>(&c10::___gpu_identifier___::___gpu_allocator_header___::raw_alloc);
return reinterpret_cast<size_t>(&c10::___gpu_identifier___::___gpu_allocator_header___::raw_alloc);
}
size_t gpu_caching_allocator_raw_delete_address() {
return reinterpret_cast<size_t>(&c10::___gpu_identifier___::___gpu_allocator_header___::raw_delete);
return reinterpret_cast<size_t>(&c10::___gpu_identifier___::___gpu_allocator_header___::raw_delete);
}
size_t gpu_caching_allocator_empty_cache_address() {
return reinterpret_cast<size_t>(&c10::___gpu_identifier___::___gpu_allocator_header___::emptyCache);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("gpu_caching_allocator_raw_alloc_address", &gpu_caching_allocator_raw_alloc_address, "Address of PyTorch GPU allocator");
m.def("gpu_caching_allocator_raw_delete_address", &gpu_caching_allocator_raw_delete_address, "Address of PyTorch GPU deallocator");
m.def("gpu_caching_allocator_raw_alloc_address", &gpu_caching_allocator_raw_alloc_address,
"Address of PyTorch GPU allocator");
m.def("gpu_caching_allocator_raw_delete_address", &gpu_caching_allocator_raw_delete_address,
"Address of PyTorch GPU deallocator");
m.def("gpu_caching_allocator_empty_cache_address", &gpu_caching_allocator_empty_cache_address,
"Address of PyTorch GPU empty cache");
}

View file

@ -639,17 +639,20 @@ def test_gradient_correctness_conv1d(use_fp16, input_requires_grad):
loss.backward()
return prediction
for step in range(10):
for _ in range(10):
x = torch.randn(N, seq_len, C_in, device=device, requires_grad=input_requires_grad)
pt_prediction = run_step(pt_model, x)
ort_prediction = run_step(ort_model, x)
# PyTorch's Conv/GonvGrad uses HEURISTIC mode to search algo while ORT uses EXHAUSTIVE mode by default.
# While different algo types generate slightly different results, especially for FP16,
# so relax the tolerance for comparison, especially for FP16 run and gradient comparison.
if use_fp16:
_test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-3, rtol=1e-3)
_test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model, rtol=1e-2, atol=2e-2)
_test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model, rtol=5e-1, atol=4e-1)
else:
_test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-5)
_test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model, rtol=5e-3, atol=4e-3)
_test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model, rtol=5e-2, atol=4e-2)
@pytest.mark.parametrize("device", ['cuda', 'cpu'])
@pytest.mark.parametrize("padding_idx", [None, 1])

View file

@ -0,0 +1,322 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "gtest/gtest.h"
#include "test/providers/provider_test_utils.h"
namespace onnxruntime {
namespace cuda {
namespace test {
using namespace std;
using namespace onnxruntime::test;
#ifdef USE_CUDA
namespace {
struct ConvGradOpAttributes {
vector<int64_t> dilations;
int64_t group;
vector<int64_t> kernel_shape;
vector<int64_t> pads;
vector<int64_t> strides;
};
void TestConvGradOp(const ConvGradOpAttributes& attributes, const vector<vector<float>>& inputs,
const vector<vector<int64_t>>& input_shapes, const vector<vector<float>>& outputs,
const vector<vector<int64_t>>& output_shapes, bool is_half = false) {
OpTester test("ConvGrad", 1, kMSDomain);
test.AddAttribute("group", attributes.group);
test.AddAttribute("kernel_shape", attributes.kernel_shape);
test.AddAttribute("pads", attributes.pads);
if (!attributes.dilations.empty()) {
test.AddAttribute("dilations", attributes.dilations);
}
if (!attributes.strides.empty()) {
test.AddAttribute("strides", attributes.strides);
}
if (is_half) {
std::vector<MLFloat16> dY_half(inputs[0].size());
ConvertFloatToMLFloat16(inputs[0].data(), dY_half.data(), static_cast<int>(inputs[0].size()));
test.AddInput<MLFloat16>("dY", input_shapes[0], dY_half);
std::vector<MLFloat16> X_half(inputs[1].size());
ConvertFloatToMLFloat16(inputs[1].data(), X_half.data(), static_cast<int>(inputs[1].size()));
test.AddInput<MLFloat16>("X", input_shapes[1], X_half);
std::vector<MLFloat16> W_half(inputs[2].size());
ConvertFloatToMLFloat16(inputs[2].data(), W_half.data(), static_cast<int>(inputs[2].size()));
test.AddInput<MLFloat16>("W", input_shapes[2], W_half);
std::vector<MLFloat16> dX_half(outputs[0].size());
ConvertFloatToMLFloat16(outputs[0].data(), dX_half.data(), static_cast<int>(outputs[0].size()));
test.AddOutput<MLFloat16>("dX", output_shapes[0], dX_half);
std::vector<MLFloat16> dW_half(outputs[1].size());
ConvertFloatToMLFloat16(outputs[1].data(), dW_half.data(), static_cast<int>(outputs[1].size()));
test.AddOutput<MLFloat16>("dW", output_shapes[1], dW_half);
if (outputs.size() >= 3) {
std::vector<MLFloat16> dB_half(outputs[2].size());
ConvertFloatToMLFloat16(outputs[2].data(), dB_half.data(), static_cast<int>(outputs[2].size()));
test.AddOutput<MLFloat16>("dB", output_shapes[2], dB_half);
}
} else {
test.AddInput<float>("dY", input_shapes[0], inputs[0]);
test.AddInput<float>("X", input_shapes[1], inputs[1]);
test.AddInput<float>("W", input_shapes[2], inputs[2]);
test.AddOutput<float>("dX", output_shapes[0], outputs[0]);
test.AddOutput<float>("dW", output_shapes[1], outputs[1]);
if (outputs.size() >= 3) {
test.AddOutput<float>("dB", output_shapes[2], outputs[2]);
}
}
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});
}
} // namespace
TEST(ConvGradTest, Conv1D_1) {
ConvGradOpAttributes attrs = {
vector<int64_t>{1}, // dilations
1, // group
vector<int64_t>{1}, // kernel_shape
vector<int64_t>{0, 0}, // pads
vector<int64_t>{1}, // strides
};
vector<float> dY(7, 1.0f);
vector<int64_t> dY_shape = {1, 1, 7};
vector<float> X = {2.0349f, -1.8088f, -0.1171f, 1.1849f, -0.6590f, -2.0404f, -1.2810f};
vector<int64_t> X_shape = {1, 1, 7};
vector<float> W = {0.5081f};
vector<int64_t> W_shape = {1, 1, 1};
vector<float> dX = {0.5081f, 0.5081f, 0.5081f, 0.5081f, 0.5081f, 0.5081f, 0.5081f};
vector<int64_t> dX_shape = {1, 1, 7};
vector<float> dW = {-2.6865f};
vector<int64_t> dW_shape = {1, 1, 1};
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW}, {dX_shape, dW_shape});
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW}, {dX_shape, dW_shape}, true);
}
TEST(ConvGradTest, Conv1D_2) {
ConvGradOpAttributes attrs = {
vector<int64_t>{2}, // dilations
1, // group
vector<int64_t>{2}, // kernel_shape
vector<int64_t>{2, 2}, // pads
vector<int64_t>{2}, // strides
};
vector<float> dY(30, 1.0f);
vector<int64_t> dY_shape = {3, 2, 5};
vector<float> X = {-0.9303f, 0.3717f, 0.4961f, 0.5068f, -0.7506f, -0.7609f, -1.8795f, 0.0536f,
1.5201f, -0.9580f, -1.7678f, 0.4683f, -0.3142f, 0.2097f, -1.3819f, -0.1070f,
-1.7558f, -0.0278f, 1.5378f, 2.6415f, 1.0004f, 1.3604f, 1.2819f, -0.4629f};
vector<int64_t> X_shape = {3, 1, 8};
vector<float> W = {1.6664f, -0.1582f, -0.8984f, 0.0849f};
vector<int64_t> W_shape = {2, 1, 2};
vector<float> dX = {0.6948f, 0.0000f, 0.6948f, 0.0000f, 0.6948f, 0.0000f, 0.6948f, 0.0000f,
0.6948f, 0.0000f, 0.6948f, 0.0000f, 0.6948f, 0.0000f, 0.6948f, 0.0000f,
0.6948f, 0.0000f, 0.6948f, 0.0000f, 0.6948f, 0.0000f, 0.6948f, 0.0000f};
vector<int64_t> dX_shape = {3, 1, 8};
vector<float> dW = {-2.9440f, -2.9440f, -2.9440f, -2.9440f};
vector<int64_t> dW_shape = {2, 1, 2};
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW}, {dX_shape, dW_shape});
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW}, {dX_shape, dW_shape}, true);
}
TEST(ConvGradTest, Conv1D_Bias) {
ConvGradOpAttributes attrs = {
vector<int64_t>{2}, // dilations
1, // group
vector<int64_t>{1}, // kernel_shape
vector<int64_t>{1, 1}, // pads
vector<int64_t>{3}, // strides
};
vector<float> dY(8, 1.0f);
vector<int64_t> dY_shape = {2, 1, 4};
vector<float> X = {0.3305f, 2.6170f, -0.8102f, -1.1348f, -0.0850f, 0.2033f, 0.7295f, -0.2826f, -0.5977f,
0.5505f, 0.3895f, -1.3394f, 0.6413f, -0.4744f, -0.9943f, 0.7560f, 0.1355f, -1.3931f,
1.2644f, 0.0240f, 0.7571f, 0.6851f, -0.3362f, -1.1230f, 0.6475f, -0.4596f, 1.1648f,
0.8991f, 0.0440f, 1.5056f, 0.9504f, -0.5266f, 0.0437f, -0.3006f, 0.8489f, 0.0960f};
vector<int64_t> X_shape = {2, 2, 9};
vector<float> W = {0.0398f, 0.1392f};
vector<int64_t> W_shape = {1, 2, 1};
vector<float> dX = {0.0000f, 0.0000f, 0.0398f, 0.0000f, 0.0000f, 0.0398f, 0.0000f, 0.0000f, 0.0398f,
0.0000f, 0.0000f, 0.1392f, 0.0000f, 0.0000f, 0.1392f, 0.0000f, 0.0000f, 0.1392f,
0.0000f, 0.0000f, 0.0398f, 0.0000f, 0.0000f, 0.0398f, 0.0000f, 0.0000f, 0.0398f,
0.0000f, 0.0000f, 0.1392f, 0.0000f, 0.0000f, 0.1392f, 0.0000f, 0.0000f, 0.1392f};
vector<int64_t> dX_shape = {2, 2, 9};
vector<float> dW = {-0.4057f, -2.0815f};
vector<int64_t> dW_shape = {1, 2, 1};
vector<float> dB = {8.f};
vector<int64_t> dB_shape = {1};
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW, dB}, {dX_shape, dW_shape, dB_shape});
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW, dB}, {dX_shape, dW_shape, dB_shape}, true);
}
TEST(ConvGradTest, Conv2D) {
ConvGradOpAttributes attrs = {
vector<int64_t>{1, 1}, // dilations
1, // group
vector<int64_t>{1, 1}, // kernel_shape
vector<int64_t>{0, 0, 0, 0}, // pads
vector<int64_t>{1, 1}, // strides
};
vector<float> dY(48, 1.0f);
vector<int64_t> dY_shape = {1, 3, 4, 4};
vector<float> X = {0.8374f, -2.0758f, 1.8918f, -1.0625f, -1.2747f, -0.1561f, 0.4573f, 2.5314f, -0.0089f, -1.0412f,
0.7690f, 0.2320f, 0.6535f, -0.4921f, -0.6051f, 0.5580f, 1.5682f, -1.0309f, -0.9379f, -0.1834f,
-1.2162f, 1.4167f, -0.2849f, -0.1625f, 0.3380f, -0.1393f, -1.1557f, 0.9718f, -0.4656f, -0.9046f,
1.5710f, -1.3963f, 1.2470f, 0.7327f, 0.8045f, 0.8071f, 1.1703f, -1.3566f, -0.2030f, -0.1227f,
-0.5881f, 2.4159f, -0.2768f, 0.5567f, -0.2805f, 0.1618f, -0.7256f, -0.1053f};
vector<int64_t> X_shape = {1, 3, 4, 4};
vector<float> W = {0.1094f, 1.1541f, 0.0486f, 0.5668f, 1.0372f, -0.3792f, 1.4979f, 0.1757f, 0.1733f};
vector<int64_t> W_shape = {3, 3, 1, 1};
vector<float> dX = {2.1741f, 2.1741f, 2.1741f, 2.1741f, 2.1741f, 2.1741f, 2.1741f, 2.1741f,
2.1741f, 2.1741f, 2.1741f, 2.1741f, 2.1741f, 2.1741f, 2.1741f, 2.1741f,
2.3670f, 2.3670f, 2.3670f, 2.3670f, 2.3670f, 2.3670f, 2.3670f, 2.3670f,
2.3670f, 2.3670f, 2.3670f, 2.3670f, 2.3670f, 2.3670f, 2.3670f, 2.3670f,
-0.1572f, -0.1572f, -0.1572f, -0.1572f, -0.1572f, -0.1572f, -0.1572f, -0.1572f,
-0.1572f, -0.1572f, -0.1572f, -0.1572f, -0.1572f, -0.1572f, -0.1572f, -0.1572f};
vector<int64_t> dX_shape = {1, 3, 4, 4};
vector<float> dW = {1.2142f, -2.0115f, 4.2372f, 1.2142f, -2.0115f, 4.2372f, 1.2142f, -2.0115f, 4.2372f};
vector<int64_t> dW_shape = {3, 3, 1, 1};
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW}, {dX_shape, dW_shape});
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW}, {dX_shape, dW_shape}, true);
}
TEST(ConvGradTest, Conv2D_Bias) {
ConvGradOpAttributes attrs = {
vector<int64_t>{1, 1}, // dilations
1, // group
vector<int64_t>{2, 2}, // kernel_shape
vector<int64_t>{0, 0, 0, 0}, // pads
vector<int64_t>{1, 1}, // strides
};
vector<float> dY(8, 1.0f);
vector<int64_t> dY_shape = {1, 2, 2, 2};
vector<float> X = {-0.4406f, 0.3064f, 0.0794f, -0.2795f, 0.8228f, 0.4751f, 1.3114f, 0.9522f, 0.9082f};
vector<int64_t> X_shape = {1, 1, 3, 3};
vector<float> W = {-0.0820f, -0.4214f, -2.2745f, -1.5834f, 0.9746f, 0.6936f, -0.5140f, 0.9900f};
vector<int64_t> W_shape = {2, 1, 2, 2};
vector<float> dX = {0.8926f, 1.1648f, 0.2723f, -1.8959f, -2.2169f, -0.3211f, -2.7884f, -3.3818f, -0.5933f};
vector<int64_t> dX_shape = {1, 1, 3, 3};
vector<float> dW = {0.4092f, 1.6838f, 2.8069f, 3.1583f, 0.4092f, 1.6838f, 2.8069f, 3.1583f};
vector<int64_t> dW_shape = {2, 1, 2, 2};
vector<float> dB = {4.f, 4.f};
vector<int64_t> dB_shape = {2};
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW, dB}, {dX_shape, dW_shape, dB_shape});
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW, dB}, {dX_shape, dW_shape, dB_shape}, true);
}
TEST(ConvGradTest, Conv3D) {
ConvGradOpAttributes attrs = {
vector<int64_t>{1, 1, 1}, // dilations
1, // group
vector<int64_t>{2, 2, 1}, // kernel_shape
vector<int64_t>{2, 2, 2, 2, 2, 2}, // pads
vector<int64_t>{2, 2, 2}, // strides
};
vector<float> dY(81, 1.0f);
vector<int64_t> dY_shape = {1, 3, 3, 3, 3};
vector<float> X = {0.5598f, -1.9201f, -0.7435f, 2.0217f, 1.2615f, -1.9540f, 0.3119f, 0.0106f, -0.1752f,
0.1553f, 1.3088f, 0.8588f, -0.6396f, -1.1059f, -0.7768f, 0.3251f, -0.3116f, -0.1495f,
0.8923f, -0.1832f, -0.3995f, -0.9641f, 1.9743f, -0.3098f, -0.3029f, -0.3453f, -0.7708f,
-0.3267f, -0.5051f, -0.9330f, -0.2421f, -0.0874f, 0.3225f, -0.8572f, -0.2019f, -0.7069f,
0.4333f, 0.5562f, -1.5587f, -1.0665f, -0.6832f, -0.4320f, -0.0225f, 1.4662f, 0.4808f,
0.0282f, 0.6967f, 0.5708f, -1.3258f, -0.6925f, 0.1217f, 1.3211f, 0.5877f, 0.7335f};
vector<int64_t> X_shape = {1, 3, 3, 3, 2};
vector<float> W = {-0.1911f, 0.6604f, -1.0283f, -0.9381f, -0.3449f, 1.1152f, -1.0256f, -0.3494f, 0.4504f,
0.2418f, 0.2258f, -1.5920f, 1.0468f, 0.2045f, 0.8264f, -0.5797f, -0.0254f, 0.6934f,
-1.7728f, 0.8619f, -0.2013f, -0.1045f, -0.4713f, 1.2544f, 1.7090f, -0.7133f, -0.6160f,
-1.2325f, -1.2152f, 0.0935f, -0.4929f, 1.3772f, 0.3125f, -0.7773f, 1.0350f, 3.2168f};
vector<int64_t> W_shape = {3, 3, 2, 2, 1};
vector<float> dX = {2.5646f, 0.0000f, 0.1516f, 0.0000f, 2.5646f, 0.0000f, -0.8178f, 0.0000f, -2.7503f,
0.0000f, -0.8178f, 0.0000f, 2.5646f, 0.0000f, 0.1516f, 0.0000f, 2.5646f, 0.0000f,
-1.5855f, 0.0000f, 1.9021f, 0.0000f, -1.5855f, 0.0000f, -3.2913f, 0.0000f, 1.8898f,
0.0000f, -3.2913f, 0.0000f, -1.5855f, 0.0000f, 1.9021f, 0.0000f, -1.5855f, 0.0000f,
0.5616f, 0.0000f, -0.6399f, 0.0000f, 0.5616f, 0.0000f, 0.7895f, 0.0000f, 2.8792f,
0.0000f, 0.7895f, 0.0000f, 0.5616f, 0.0000f, -0.6399f, 0.0000f, 0.5616f, 0.0000f};
vector<int64_t> dX_shape = {1, 3, 3, 3, 2};
vector<float> dW = {0.8701f, -1.5203f, 1.6207f, -0.1752f, 2.4225f, -0.0770f, -0.8080f, -0.7708f, -0.9880f,
-1.4370f, 0.6742f, 0.4808f, 0.8701f, -1.5203f, 1.6207f, -0.1752f, 2.4225f, -0.0770f,
-0.8080f, -0.7708f, -0.9880f, -1.4370f, 0.6742f, 0.4808f, 0.8701f, -1.5203f, 1.6207f,
-0.1752f, 2.4225f, -0.0770f, -0.8080f, -0.7708f, -0.9880f, -1.4370f, 0.6742f, 0.4808f};
vector<int64_t> dW_shape = {3, 3, 2, 2, 1};
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW}, {dX_shape, dW_shape});
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW}, {dX_shape, dW_shape}, true);
}
TEST(ConvTest, Conv3D_Bias) {
ConvGradOpAttributes attrs = {
vector<int64_t>{2, 2, 2}, // dilations
1, // group
vector<int64_t>{2, 2, 2}, // kernel_shape
vector<int64_t>{2, 2, 2, 2, 2, 2}, // pads
vector<int64_t>{2, 2, 2}, // strides
};
vector<float> dY(108, 1.0f);
vector<int64_t> dY_shape = {2, 2, 3, 3, 3};
vector<float> X = {0.5189f, -0.8970f, 1.6637f, -0.0882f, 0.0562f, 1.5707f, 0.5424f, 0.4293f, -0.7949f, 2.1479f,
0.8582f, -0.5160f, -0.2406f, 0.0915f, -0.6440f, -0.3800f, 1.7989f, -1.6479f, 0.9071f, -0.4087f,
1.3729f, -1.5582f, 1.9312f, 1.0753f, 0.8313f, 0.5097f, -0.0664f, -1.2774f, 1.7208f, -1.7777f,
-2.0758f, -0.4782f, 2.0207f, 1.4053f, 0.3488f, -0.0871f, 1.4151f, 2.5243f, 0.2891f, 0.8317f,
1.8934f, 0.3911f, 0.2915f, -0.8505f, 1.0430f, 0.5391f, 0.8347f, 0.0633f, -0.3250f, 1.3358f,
-0.3121f, 0.4587f, -0.4955f, 1.8411f, 0.9877f, 1.0809f, 0.0119f, -1.2706f, 1.8457f, -0.1520f,
-0.4535f, -0.5325f, -0.8921f, -0.3127f, 0.5746f, -1.2514f, 0.4638f, 0.8440f, -0.6113f, 0.6936f,
0.0998f, 0.9767f, 0.2785f, -0.3068f, -0.4619f, 0.4801f, -2.1590f, -1.7342f, 0.7354f, 0.0234f,
1.8095f, 0.1252f, -0.5841f, 0.0738f, 1.4252f, 1.4222f, -0.1192f, -2.9955f, 0.8287f, 0.6252f,
-1.5834f, -0.1388f, 0.5532f, 0.4044f, 1.0432f, -2.3991f, 0.4339f, 0.1083f, -0.7726f, 2.0629f,
0.7136f, -0.0978f, -0.7905f, 0.9585f, -0.3205f, 1.3750f, 0.4137f, -0.4552f, 2.7165f, -1.6367f,
-0.6286f, -0.4656f, 0.6219f, -1.7275f, 1.7599f, -1.0443f, 1.3212f, 0.1621f, -0.5357f, 0.0957f,
0.4524f, -0.3814f, -0.0744f, 0.8301f, -0.9539f, 0.0867f, -0.7864f, 1.4918f};
vector<int64_t> X_shape = {2, 1, 4, 4, 4};
vector<float> W = {-0.6570f, 0.1637f, 1.7824f, 0.7986f, -0.2703f, -0.7447f, 1.2674f, 0.2019f,
0.1045f, -0.7279f, 0.9658f, 0.4698f, -0.6699f, -1.5259f, 2.1664f, -0.3859f};
vector<int64_t> W_shape = {2, 1, 2, 2, 2};
vector<float> dX = {2.9389f, 0.0000f, 2.9389f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 2.9389f, 0.0000f, 2.9389f,
0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f,
0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 2.9389f,
0.0000f, 2.9389f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 2.9389f, 0.0000f, 2.9389f, 0.0000f,
0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f,
0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 2.9389f, 0.0000f,
2.9389f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 2.9389f, 0.0000f, 2.9389f, 0.0000f, 0.0000f,
0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f,
0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 2.9389f, 0.0000f, 2.9389f,
0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 2.9389f, 0.0000f, 2.9389f, 0.0000f, 0.0000f, 0.0000f,
0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f,
0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.0000f};
vector<int64_t> dX_shape = {2, 1, 4, 4, 4};
vector<float> dW = {7.4097f, 7.4097f, 7.4097f, 7.4097f, 7.4097f, 7.4097f, 7.4097f, 7.4097f,
7.4097f, 7.4097f, 7.4097f, 7.4097f, 7.4097f, 7.4097f, 7.4097f, 7.4097f};
vector<int64_t> dW_shape = {2, 1, 2, 2, 2};
vector<float> dB = {54.f, 54.f};
vector<int64_t> dB_shape = {2};
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW, dB}, {dX_shape, dW_shape, dB_shape});
TestConvGradOp(attrs, {dY, X, W}, {dY_shape, X_shape, W_shape}, {dX, dW, dB}, {dX_shape, dW_shape, dB_shape}, true);
}
#endif // USE_CUDA
} // namespace test
} // namespace cuda
} // namespace onnxruntime

View file

@ -4,114 +4,343 @@
#include "orttraining/training_ops/cuda/nn/conv_grad.h"
#include "core/providers/common.h"
#include "core/providers/cuda/cuda_common.h"
#include "core/providers/cuda/shared_inc/fpgeneric.h"
#include "core/platform/ort_mutex.h"
// The AlgoPerfCache and AlgoSearch here for Conv/ConvGrad is referenced on PyTorch's implementation
// from aten/src/ATen/native/cudnn/Conv_v7.cpp.
namespace onnxruntime {
namespace cuda {
#define REGISTER_GRADIENT_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
ConvGrad, \
kMSDomain, \
1, \
T, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
ConvGrad<T>);
#define REGISTER_GRADIENT_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX(ConvGrad, kMSDomain, 1, T, kCudaExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
ConvGrad<T>);
REGISTER_GRADIENT_KERNEL_TYPED(float)
REGISTER_GRADIENT_KERNEL_TYPED(double)
REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16)
cudnnStatus_t getWorkspaceSize(
const ConvolutionArgs& args,
cudnnConvolutionBwdDataAlgo_t algo, size_t* sz) {
return cudnnGetConvolutionBackwardDataWorkspaceSize(
args.handle,
args.w_desc,
args.o_desc,
args.c_desc,
args.i_desc,
algo,
sz);
using T_BwdDataPerf = cudnnConvolutionBwdDataAlgoPerf_t;
using T_BwdDataAlgo = cudnnConvolutionBwdDataAlgo_t;
using T_BwdFilterPerf = cudnnConvolutionBwdFilterAlgoPerf_t;
using T_BwdFilterAlgo = cudnnConvolutionBwdFilterAlgo_t;
cudnnStatus_t GetWorkspaceSize(const ConvArgs& args, T_BwdDataAlgo algo, size_t* workspace_size) {
return cudnnGetConvolutionBackwardDataWorkspaceSize(args.handle, args.w_desc, args.y_tensor, args.conv_desc,
args.x_tensor, algo, workspace_size);
}
cudnnStatus_t getWorkspaceSize(
const ConvolutionArgs& args,
cudnnConvolutionBwdFilterAlgo_t algo, size_t* sz) {
return cudnnGetConvolutionBackwardFilterWorkspaceSize(
args.handle,
args.i_desc,
args.o_desc,
args.c_desc,
args.w_desc,
algo,
sz);
cudnnStatus_t GetWorkspaceSize(const ConvArgs& args, T_BwdFilterAlgo algo, size_t* workspace_size) {
return cudnnGetConvolutionBackwardFilterWorkspaceSize(args.handle, args.x_tensor, args.y_tensor, args.conv_desc,
args.w_desc, algo, workspace_size);
}
// TODO: we can cache the descriptors, and only update if the input shape changes
template <typename T_Algo>
size_t GetMaxWorkspaceSize(const ConvArgs& args, const T_Algo* algo, int n_algo) {
// Calling cudaMemGetInfo is not ideal, but our cuda allocator doesn't have a way to get this info.
size_t free, total;
CUDA_CALL_THROW(cudaMemGetInfo(&free, &total));
// Assuming 10% of fragmentation.
free = static_cast<size_t>(static_cast<double>(free) * 0.9);
size_t max_workspace_size = 0;
for (int i = 0; i < n_algo; i++) {
cudnnStatus_t status;
size_t workspace_size;
status = GetWorkspaceSize(args, algo[i], &workspace_size);
if (CUDNN_STATUS_SUCCESS != status || workspace_size == 0 || workspace_size < max_workspace_size ||
workspace_size > free)
continue;
max_workspace_size = workspace_size;
}
return max_workspace_size;
}
template <typename T_Perf>
std::vector<T_Perf> GetValidAlgorithms(const T_Perf* perf_results, int n_algo) {
std::vector<T_Perf> result;
result.reserve(n_algo);
for (int i = 0; i < n_algo; i++) {
T_Perf perf = perf_results[i];
if (perf.status == CUDNN_STATUS_SUCCESS) {
result.emplace_back(perf);
}
}
ORT_ENFORCE(result.size() > 0, "No valid convolution algorithms available in CuDNN");
// TODO: This is a cuDNN bug that gave wrong results in certain strided convolution gradient setups
// when cuDNN version < 7.5. Need to add handling for such special case.
return result;
}
struct ConvParamsHash {
// ConvParams must be a POD because we read out its memory constant as char* when hashing.
static_assert(std::is_pod<ConvParams>::value, "ConvParams is not POD");
size_t operator()(const ConvParams& conv_params) const {
auto ptr = reinterpret_cast<const uint8_t*>(&conv_params);
uint32_t value = 0x811C9DC5;
for (int i = 0; i < static_cast<int>(sizeof(ConvParams)); ++i) {
value ^= ptr[i];
value *= 0x01000193;
}
return static_cast<size_t>(value);
}
};
struct ConvParamsEqual {
// ConvParams must be a POD because we read out its memory constant as char* when hashing.
static_assert(std::is_pod<ConvParams>::value, "ConvParams is not POD");
bool operator()(const ConvParams& a, const ConvParams& b) const {
auto ptr1 = reinterpret_cast<const uint8_t*>(&a);
auto ptr2 = reinterpret_cast<const uint8_t*>(&b);
return memcmp(ptr1, ptr2, sizeof(ConvParams)) == 0;
}
};
template <typename T_Perf>
struct AlgoPerfCache {
mutable OrtMutex mutex;
std::unordered_map<ConvParams, T_Perf, ConvParamsHash, ConvParamsEqual> map;
bool Find(const ConvParams& params, T_Perf* result) {
std::lock_guard<OrtMutex> guard(mutex);
auto it = map.find(params);
if (it == map.end()) {
return false;
}
*result = it->second;
return true;
}
void Insert(const ConvParams& params, const T_Perf& algo_perf) {
std::lock_guard<OrtMutex> guard(mutex);
map[params] = algo_perf;
}
};
// TODO: Currently we use global AlgoPerfCache for ConvGrad only. Conv's perf cache is till per node.
// Need to apply such global cache for Conv, and move some shared code from here to conv.h/cc.
AlgoPerfCache<T_BwdDataPerf> bwd_data_algos;
AlgoPerfCache<T_BwdFilterPerf> bwd_filter_algos;
template <typename T_Perf>
struct AlgoSearch {};
template <>
struct AlgoSearch<T_BwdDataPerf> {
static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
static AlgoPerfCache<T_BwdDataPerf>& Cache() { return bwd_data_algos; }
static Status FindAlgorithms(const ConvArgs& args, const CUDAExecutionProvider* provider,
std::vector<T_BwdDataPerf>& perf_results) {
static const T_BwdDataAlgo algos[] = {
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED};
static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
ORT_ENFORCE(sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward data algorithms.");
int perf_count;
std::unique_ptr<T_BwdDataPerf[]> candidates(new T_BwdDataPerf[num_algos]);
if (args.params.algo_mode == OrtCudnnConvAlgoSearch::HEURISTIC) {
CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionBackwardDataAlgorithm_v7(args.handle, args.w_desc, args.y_tensor,
args.conv_desc, args.x_tensor, num_algos,
&perf_count, candidates.get()));
} else if (args.params.algo_mode == OrtCudnnConvAlgoSearch::EXHAUSTIVE) {
size_t max_workspace_size = provider->GetCudnnConvUseMaxWorkspace() ? GetMaxWorkspaceSize(args, algos, num_algos)
: AlgoSearchWorkspaceSize;
// Use GetTransientScratchBuffer() so the workspace can be freed instead of cached.
// Because the benchmarking uses a huge amount of memory, e.g. a few GBs.
IAllocatorUniquePtr<void> workspace = provider->GetTransientScratchBuffer<void>(max_workspace_size);
CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionBackwardDataAlgorithmEx(
args.handle, args.w_desc, args.w_data, args.y_tensor, args.dy_data, args.conv_desc, args.x_tensor,
args.dx_data, num_algos, &perf_count, candidates.get(), workspace.get(), max_workspace_size));
} else {
ORT_ENFORCE(false, "Algo mode should be EXHAUSTIVE (0) or HEURISTIC (1), but got ", args.params.algo_mode);
}
perf_results = GetValidAlgorithms<T_BwdDataPerf>(candidates.get(), perf_count);
return Status::OK();
}
};
template <>
struct AlgoSearch<T_BwdFilterPerf> {
static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
static AlgoPerfCache<T_BwdFilterPerf>& Cache() { return bwd_filter_algos; }
static Status FindAlgorithms(const ConvArgs& args, const CUDAExecutionProvider* provider,
std::vector<T_BwdFilterPerf>& perf_results) {
static const T_BwdFilterAlgo algos[] = {
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING,
};
// NOTE: - 1 because ALGO_WINOGRAD is not implemented.
static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1;
ORT_ENFORCE(sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward filter algorithms.");
std::unique_ptr<T_BwdFilterPerf[]> candidates(new T_BwdFilterPerf[num_algos]);
int perf_count;
if (args.params.algo_mode == OrtCudnnConvAlgoSearch::HEURISTIC) {
CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionBackwardFilterAlgorithm_v7(args.handle, args.x_tensor, args.y_tensor,
args.conv_desc, args.w_desc, num_algos,
&perf_count, candidates.get()));
} else if (args.params.algo_mode == OrtCudnnConvAlgoSearch::EXHAUSTIVE) {
size_t max_workspace_size = provider->GetCudnnConvUseMaxWorkspace() ? GetMaxWorkspaceSize(args, algos, num_algos)
: AlgoSearchWorkspaceSize;
// Use GetTransientScratchBuffer() so the workspace can be freed instead of cached.
// Because the benchmarking uses a huge amount of memory, e.g. a few GBs.
IAllocatorUniquePtr<void> workspace = provider->GetTransientScratchBuffer<void>(max_workspace_size);
CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionBackwardFilterAlgorithmEx(
args.handle, args.x_tensor, args.x_data, args.y_tensor, args.dy_data, args.conv_desc, args.w_desc,
args.dw_data, num_algos, &perf_count, candidates.get(), workspace.get(), max_workspace_size));
} else {
ORT_ENFORCE(false, "Algo mode should be EXHAUSTIVE (0) or HEURISTIC (1), but got ", args.params.algo_mode);
}
perf_results = GetValidAlgorithms<T_BwdFilterPerf>(candidates.get(), perf_count);
return Status::OK();
}
};
template <typename T_Perf>
class AlgoIterator {
public:
AlgoIterator(const ConvArgs& args) : args_(args) {}
static Status OnlyDefaultAlgorithm(const ConvArgs& args, std::vector<T_Perf>& perf_results) {
perf_results.resize(1);
perf_results[0].algo = AlgoSearch<T_Perf>::DEFAULT_ALGO;
if (args.params.data_type == CUDNN_DATA_HALF) {
perf_results[0].mathType = CUDNN_TENSOR_OP_MATH;
} else {
perf_results[0].mathType = CUDNN_DEFAULT_MATH;
}
CUDNN_RETURN_IF_ERROR(GetWorkspaceSize(args, perf_results[0].algo, &(perf_results[0].memory)));
return Status::OK();
}
Status TryAll(const CUDAExecutionProvider* provider, std::function<Status(const T_Perf& perf)> f) {
auto& cache = AlgoSearch<T_Perf>::Cache();
T_Perf algo_perf;
if (cache.Find(args_.params, &algo_perf) && f(algo_perf) == Status::OK()) {
return Status::OK();
}
std::vector<T_Perf> perf_results;
ORT_RETURN_IF_ERROR(args_.params.algo_mode == OrtCudnnConvAlgoSearch::DEFAULT
? OnlyDefaultAlgorithm(args_, perf_results)
: AlgoSearch<T_Perf>::FindAlgorithms(args_, provider, perf_results));
for (auto& algo_perf : perf_results) {
if (f(algo_perf) == Status::OK()) {
cache.Insert(args_.params, algo_perf);
return Status::OK();
}
}
ORT_ENFORCE(false, "Unable to find a valid cuDNN algorithm to run convolution.");
return Status::OK();
}
private:
const ConvArgs& args_;
};
template <typename T>
Status ConvGrad<T>::PrepareArgs(const Tensor& input, const Tensor& output, const Tensor& weight, const Tensor* bias) const {
const TensorShape& i_shape = input.Shape();
std::vector<int64_t> i_dims = i_shape.GetDims();
Status ConvGrad<T>::PrepareArgs(const Tensor& x, const Tensor& dY, const Tensor& w, Tensor* dB, Tensor* dX,
Tensor* dW) const {
const TensorShape& x_shape = x.Shape();
std::vector<int64_t> x_dims = x_shape.GetDims();
args_.x_data = reinterpret_cast<const CudaT*>(x.template Data<T>());
const TensorShape& o_shape = output.Shape();
std::vector<int64_t> o_dims = o_shape.GetDims();
const TensorShape& dy_shape = dY.Shape();
std::vector<int64_t> dy_dims = dy_shape.GetDims();
args_.dy_data = reinterpret_cast<const CudaT*>(dY.template Data<T>());
const TensorShape& w_shape = weight.Shape();
const TensorShape& w_shape = w.Shape();
std::vector<int64_t> w_dims = w_shape.GetDims();
args_.w_data = reinterpret_cast<const CudaT*>(w.template Data<T>());
// Update Attributes
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(&input, &weight));
args_.db_data = dB ? reinterpret_cast<CudaT*>(dB->template MutableData<T>()) : nullptr;
args_.dx_data = dX ? reinterpret_cast<CudaT*>(dX->template MutableData<T>()) : nullptr;
args_.dw_data = dW ? reinterpret_cast<CudaT*>(dW->template MutableData<T>()) : nullptr;
std::vector<int64_t> kernel_shape;
ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(w_shape, kernel_shape));
auto rank = kernel_shape.size();
bool x_dims_changed = (args_.last_x_dims != x_dims);
bool w_dims_changed = (args_.last_w_dims != w_dims);
if (x_dims_changed || w_dims_changed) {
if (x_dims_changed) args_.last_x_dims = x_dims;
if (w_dims_changed) args_.last_w_dims = w_dims;
std::vector<int64_t> pads(conv_attrs_.pads);
if (pads.empty()) {
pads.resize(rank * 2, 0);
}
// Update Attributes
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(&x, &w));
std::vector<int64_t> dilations(conv_attrs_.dilations);
if (dilations.empty()) {
dilations.resize(rank, 1);
}
std::vector<int64_t> kernel_shape;
ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(w_shape, kernel_shape));
auto rank = kernel_shape.size();
std::vector<int64_t> strides(conv_attrs_.strides);
if (strides.empty()) {
strides.resize(rank, 1);
}
std::vector<int64_t> pads(conv_attrs_.pads);
if (pads.empty()) {
pads.resize(rank * 2, 0);
}
// cudnn only takes 4D or 5D input, so pad dimensions if needed
if (rank < 2) {
i_dims.push_back(1);
o_dims.push_back(1);
w_dims.push_back(1);
std::vector<int64_t> dilations(conv_attrs_.dilations);
if (dilations.empty()) {
dilations.resize(rank, 1);
}
pads.insert(pads.begin() + rank, 0);
pads.insert(pads.end(), 0);
kernel_shape.push_back(1);
strides.push_back(1);
dilations.push_back(1);
}
std::vector<int64_t> strides(conv_attrs_.strides);
if (strides.empty()) {
strides.resize(rank, 1);
}
args_.handle = CudnnHandle();
args_.data_type = CudnnTensor::GetDataType<CudaT>();
ORT_RETURN_IF_ERROR(args_.i_desc.Set(i_dims, args_.data_type));
ORT_RETURN_IF_ERROR(args_.o_desc.Set(o_dims, args_.data_type));
ORT_RETURN_IF_ERROR(args_.w_desc.Set(w_dims, args_.data_type));
ORT_RETURN_IF_ERROR(args_.c_desc.Set(kernel_shape.size(), pads, strides, dilations,
gsl::narrow_cast<int>(conv_attrs_.group),
CUDNN_CROSS_CORRELATION, args_.data_type));
// cuDNN only takes 4D or 5D x tensor, so pad dimensions if needed.
if (rank < 2) {
x_dims.push_back(1);
dy_dims.push_back(1);
w_dims.push_back(1);
pads.insert(pads.begin() + rank, 0);
pads.insert(pads.end(), 0);
kernel_shape.push_back(1);
strides.push_back(1);
dilations.push_back(1);
}
if (bias) {
const TensorShape& b_shape = bias->Shape();
ORT_RETURN_IF_NOT(b_shape.NumDimensions() == 1, "bias should be 1D");
std::vector<int64_t> b_dims(2 + kernel_shape.size(), 1);
b_dims[1] = b_shape[0];
ORT_RETURN_IF_ERROR(args_.b_desc.Set(b_dims, args_.data_type));
const CUDAExecutionProvider* cuda_ep =
static_cast<const CUDAExecutionProvider*>(this->Info().GetExecutionProvider());
memset(&args_.params, 0, sizeof(ConvParams));
args_.params.device_id = static_cast<int8_t>(cuda_ep->GetDeviceId());
args_.params.data_type = CudnnTensor::GetDataType<CudaT>();
args_.params.input_dim = static_cast<uint8_t>(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
args_.params.input_size[i] = static_cast<int>(x_dims[i]);
args_.params.weight_size[i] = static_cast<int>(w_dims[i]);
}
for (size_t i = 0; i < rank; i++) {
args_.params.padding[i] = static_cast<int>(pads[i]);
args_.params.padding[i + rank] = static_cast<int>(pads[i + rank]);
args_.params.stride[i] = static_cast<int>(strides[i]);
args_.params.dilation[i] = static_cast<int>(dilations[i]);
}
args_.params.groups = conv_attrs_.group;
int algo_mode = cuda_ep->GetCudnnConvAlgo();
ORT_ENFORCE(algo_mode > -1 && algo_mode < 3,
"Algo mode should be EXHAUSTIVE (0), HEURISTIC (1) or DEFAULT (2), but got ", algo_mode);
args_.params.algo_mode = algo_mode;
args_.handle = CudnnHandle();
ORT_RETURN_IF_ERROR(args_.w_desc.Set(w_dims, args_.params.data_type));
ORT_RETURN_IF_ERROR(args_.x_tensor.Set(x_dims, args_.params.data_type));
ORT_RETURN_IF_ERROR(args_.y_tensor.Set(dy_dims, args_.params.data_type));
ORT_RETURN_IF_ERROR(args_.conv_desc.Set(kernel_shape.size(), pads, strides, dilations,
gsl::narrow_cast<int>(conv_attrs_.group), CUDNN_CROSS_CORRELATION,
args_.params.data_type));
if (dB) {
const TensorShape& db_shape = dB->Shape();
ORT_RETURN_IF_NOT(db_shape.NumDimensions() == 1, "bias should be 1D");
std::vector<int64_t> db_dims(2 + kernel_shape.size(), 1);
db_dims[1] = db_shape[0];
ORT_RETURN_IF_ERROR(args_.b_tensor.Set(db_dims, CudnnTensor::GetDataType<CudaT>()));
}
}
return Status::OK();
@ -122,106 +351,56 @@ Status ConvGrad<T>::ComputeInternal(OpKernelContext* context) const {
const Tensor* dY = context->Input<Tensor>(0);
const Tensor* X = context->Input<Tensor>(1);
const Tensor* W = context->Input<Tensor>(2);
const int64_t M = W->Shape()[0];
Tensor* dX = context->Output(0, X->Shape());
Tensor* dW = context->Output(1, W->Shape());
Tensor* dB = context->Output(2, {M});
ORT_RETURN_IF_ERROR(PrepareArgs(*X, *dY, *W, dB));
ORT_RETURN_IF_ERROR(ComputeWeightGradient(dW, dY, X));
ORT_RETURN_IF_ERROR(ComputeInputGradient(dX, dY, W));
ORT_RETURN_IF_ERROR(ComputeBiasGradient(dB, dY));
Tensor* dB = context->Output(2, {W->Shape()[0]});
ORT_RETURN_IF_ERROR(PrepareArgs(*X, *dY, *W, dB, dX, dW));
if (dX) ORT_RETURN_IF_ERROR(ComputeInputGradient());
if (dW) ORT_RETURN_IF_ERROR(ComputeWeightGradient());
if (dB) ORT_RETURN_IF_ERROR(ComputeBiasGradient());
return Status::OK();
}
template <typename T>
Status ConvGrad<T>::ComputeWeightGradient(Tensor* dW, const Tensor* dY, const Tensor* X) const {
if (dW == nullptr) return Status::OK();
// TODO: implement the algoritm search
cudnnConvolutionBwdFilterAlgoPerf_t perf;
perf.algo = kDefaultConvBwdFilterAlgo;
if (args_.data_type == CUDNN_DATA_HALF) {
perf.mathType = CUDNN_TENSOR_OP_MATH;
} else {
perf.mathType = CUDNN_DEFAULT_MATH;
}
CUDNN_RETURN_IF_ERROR(getWorkspaceSize(args_, perf.algo, &perf.memory));
CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(args_.c_desc, perf.mathType));
void* dw_data = dW->template MutableData<T>();
const void* dy_data = dY->template Data<T>();
const void* x_data = X->template Data<T>();
IAllocatorUniquePtr<void> workspace = GetScratchBuffer<void>(perf.memory);
const auto one = Consts<CudaT>::One;
const auto zero = Consts<CudaT>::Zero;
CUDNN_RETURN_IF_ERROR(
cudnnConvolutionBackwardFilter(
args_.handle,
&one, args_.i_desc, x_data,
args_.o_desc, dy_data,
args_.c_desc, perf.algo, workspace.get(), perf.memory,
&zero, args_.w_desc, dw_data));
Status ConvGrad<T>::ComputeInputGradient() const {
AlgoIterator<T_BwdDataPerf>(args_).TryAll(
static_cast<const CUDAExecutionProvider*>(Info().GetExecutionProvider()),
[&](const T_BwdDataPerf& algo_perf) -> Status {
const auto one = Consts<CudaT>::One;
const auto zero = Consts<CudaT>::Zero;
IAllocatorUniquePtr<void> workspace = GetScratchBuffer<void>(algo_perf.memory);
CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(args_.conv_desc, algo_perf.mathType));
CUDNN_RETURN_IF_ERROR(cudnnConvolutionBackwardData(
args_.handle, &one, args_.w_desc, args_.w_data, args_.y_tensor, args_.dy_data, args_.conv_desc,
algo_perf.algo, workspace.get(), algo_perf.memory, &zero, args_.x_tensor, args_.dx_data));
return Status::OK();
});
return Status::OK();
}
template <typename T>
Status ConvGrad<T>::ComputeInputGradient(Tensor* dX, const Tensor* dY, const Tensor* W) const {
if (dX == nullptr) return Status::OK();
// TODO: implement the algoritm search
cudnnConvolutionBwdDataAlgoPerf_t perf;
perf.algo = kDefaultConvBwdDataAlgo;
if (args_.data_type == CUDNN_DATA_HALF) {
perf.mathType = CUDNN_TENSOR_OP_MATH;
} else {
perf.mathType = CUDNN_DEFAULT_MATH;
}
CUDNN_RETURN_IF_ERROR(getWorkspaceSize(args_, perf.algo, &perf.memory));
CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(args_.c_desc, perf.mathType));
void* dx_data = dX->template MutableData<T>();
const void* dy_data = dY->template Data<T>();
const void* w_data = W->template Data<T>();
IAllocatorUniquePtr<void> workspace = GetScratchBuffer<void>(perf.memory);
const auto one = Consts<CudaT>::One;
const auto zero = Consts<CudaT>::Zero;
CUDNN_RETURN_IF_ERROR(
cudnnConvolutionBackwardData(
args_.handle,
&one, args_.w_desc, w_data,
args_.o_desc, dy_data,
args_.c_desc, perf.algo, workspace.get(), perf.memory,
&zero, args_.i_desc, dx_data));
Status ConvGrad<T>::ComputeWeightGradient() const {
AlgoIterator<T_BwdFilterPerf>(args_).TryAll(
static_cast<const CUDAExecutionProvider*>(Info().GetExecutionProvider()),
[&](const T_BwdFilterPerf& algo_perf) -> Status {
const auto one = Consts<CudaT>::One;
const auto zero = Consts<CudaT>::Zero;
IAllocatorUniquePtr<void> workspace = GetScratchBuffer<void>(algo_perf.memory);
CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(args_.conv_desc, algo_perf.mathType));
CUDNN_RETURN_IF_ERROR(cudnnConvolutionBackwardFilter(
args_.handle, &one, args_.x_tensor, args_.x_data, args_.y_tensor, args_.dy_data, args_.conv_desc,
algo_perf.algo, workspace.get(), algo_perf.memory, &zero, args_.w_desc, args_.dw_data));
return Status::OK();
});
return Status::OK();
}
template <typename T>
Status ConvGrad<T>::ComputeBiasGradient(Tensor* dB, const Tensor* dY) const {
if (dB == nullptr) return Status::OK();
Status ConvGrad<T>::ComputeBiasGradient() const {
const auto one = Consts<CudaT>::One;
const auto zero = Consts<CudaT>::Zero;
void* db_data = dB->template MutableData<T>();
const void* dy_data = dY->template Data<T>();
CUDNN_RETURN_IF_ERROR(
cudnnConvolutionBackwardBias(
args_.handle,
&one, args_.o_desc, dy_data,
&zero, args_.b_desc, db_data));
CUDNN_RETURN_IF_ERROR(cudnnConvolutionBackwardBias(args_.handle, &one, args_.y_tensor, args_.dy_data, &zero,
args_.b_tensor, args_.db_data));
return Status::OK();
}

View file

@ -3,8 +3,6 @@
#pragma once
#include "core/common/common.h"
#include "core/providers/cuda/cuda_kernel.h"
#include "core/providers/cuda/cudnn_common.h"
#include "core/providers/cpu/nn/conv_attributes.h"
#include "core/providers/cuda/nn/conv.h"
@ -12,15 +10,38 @@
namespace onnxruntime {
namespace cuda {
struct ConvolutionArgs {
cudnnHandle_t handle;
// cuDNN only takes 4D or 5D x tensor.
constexpr int MAX_DIM = 3;
struct ConvParams {
int8_t device_id;
cudnnDataType_t data_type;
int input_size[2 + MAX_DIM];
uint8_t input_dim;
int weight_size[2 + MAX_DIM];
int padding[MAX_DIM * 2];
int stride[MAX_DIM];
int dilation[MAX_DIM];
int64_t groups;
int algo_mode;
};
CudnnTensor i_desc, o_desc, b_desc;
struct ConvArgs {
// Update needed if x or w's dims changed.
std::vector<int64_t> last_x_dims;
std::vector<int64_t> last_w_dims;
cudnnHandle_t handle;
ConvParams params;
CudnnTensor x_tensor, y_tensor, b_tensor;
CudnnFilterDescriptor w_desc;
CudnnConvolutionDescriptor c_desc;
ConvolutionArgs() {}
CudnnConvolutionDescriptor conv_desc;
const void* x_data;
const void* w_data;
const void* dy_data;
void* dx_data;
void* dw_data;
void* db_data;
};
template <typename T>
@ -39,19 +60,14 @@ class ConvGrad final : public CudaKernel {
Status ComputeInternal(OpKernelContext* context) const override;
protected:
mutable ConvolutionArgs args_;
Status PrepareArgs(const Tensor& input, const Tensor& output, const Tensor& weight, const Tensor* bias) const;
Status PrepareArgs(const Tensor& x, const Tensor& dY, const Tensor& w, Tensor* dB, Tensor* dX, Tensor* dW) const;
mutable ConvArgs args_;
ConvAttributes conv_attrs_;
// https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn_742/cudnn-developer-guide/index.html#tensor_ops
static constexpr auto kDefaultConvBwdDataAlgo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
static constexpr auto kDefaultConvBwdFilterAlgo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
private:
Status ComputeWeightGradient(Tensor* dW, const Tensor* dY, const Tensor* X) const;
Status ComputeInputGradient(Tensor* dX, const Tensor* dY, const Tensor* W) const;
Status ComputeBiasGradient(Tensor* dB, const Tensor* dY) const;
Status ComputeWeightGradient() const;
Status ComputeInputGradient() const;
Status ComputeBiasGradient() const;
};
} // namespace cuda