Fix unsound hipify in ROCm EP (#13269)

Some cuda related things is still left in the rocm ep statically
hipified code. Eliminate them to avoid confusion.
This commit is contained in:
cloudhan 2022-10-12 08:32:42 +08:00 committed by GitHub
parent b2353fa737
commit 1e55949a70
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 31 additions and 29 deletions

View file

@ -40,6 +40,8 @@ constexpr const char* CUDA_PINNED = "CudaPinned";
constexpr const char* CANN = "Cann";
constexpr const char* CANN_PINNED = "CannPinned";
constexpr const char* DML = "DML";
constexpr const char* HIP = "Hip";
constexpr const char* HIP_PINNED = "HipPinned";
constexpr const char* OpenVINO_CPU = "OpenVINO_CPU";
constexpr const char* OpenVINO_GPU = "OpenVINO_GPU";
@ -87,7 +89,7 @@ class IAllocator {
}
/**
* Calculate the memory size for an array. The size is bounds checked using SafeInt.
* Calculate the memory size for an array. The size is bounds checked using SafeInt.
* \tparam alignment must be power of 2
* \param nmemb Number of members or elements in the array
* \param size Size of each element
@ -103,7 +105,7 @@ class IAllocator {
* \param size Size of each element
* \param out Total size required after any alignment is applied
* \return true, successful. false, overflow
* \remarks This was the original API and was implemented in the header. Replaced with the above version
* \remarks This was the original API and was implemented in the header. Replaced with the above version
* implemented in the .cc file so that the SafeInt dependency is internal.
*/
template <size_t alignment>

View file

@ -27,7 +27,7 @@ size_t GetAttentionWorkspaceSize(
Status LaunchAttentionKernel(
const hipDeviceProp_t& prop, // Device Properties
hipStream_t stream, // cuda stream
hipStream_t stream, // Hip stream
rocblas_handle& rocblas, // Rocblas handle
const size_t element_size, // Element size of input tensor
int batch_size, // Batch size (B)
@ -48,7 +48,7 @@ Status LaunchAttentionKernel(
Status LaunchDecoderAttentionKernel(
const hipDeviceProp_t& prop, // Device Properties
hipStream_t stream, // Cuda stream
hipStream_t stream, // Hip stream
rocblas_handle& rocblas, // Rocblas handle
const size_t element_size, // Element size of input tensor
const int batch_size, // Batch size (B)

View file

@ -336,7 +336,7 @@ struct GridDim {
};
};
// aligned vector generates vectorized load/store on CUDA
// aligned vector generates vectorized load/store
template<typename T, int vec_size>
struct alignas(sizeof(T) * vec_size) aligned_vector {
T val[vec_size];

View file

@ -33,8 +33,8 @@ GPUDataTransfer::~GPUDataTransfer() {
}
bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED ||
dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED ||
dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::HIP_PINNED;
}
common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const {
@ -46,7 +46,7 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e
auto& dst_device = dst.Location().device;
if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
// copy from pinned memory to GPU, this is non-blocking
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, GetStream(exec_queue_id)));
} else if (src_device.Type() == OrtDevice::GPU) {
@ -61,7 +61,7 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e
HIP_RETURN_IF_ERROR(hipStreamSynchronize(GetStream(kHipStreamDefault)));
}
} else if (src_device.Type() == OrtDevice::GPU) {
if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
// copying from GPU to pinned memory, this is non-blocking
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, GetStream(exec_queue_id)));
} else {

View file

@ -164,5 +164,5 @@ __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batc
}
}
} // namespace cuda
} // namespace rocm
} // namespace onnxruntime

View file

@ -57,7 +57,7 @@ class ROCMPinnedAllocator : public IAllocator {
ROCMPinnedAllocator(OrtDevice::DeviceId device_id, const char* name)
: IAllocator(
OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, device_id),
OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, device_id),
device_id, OrtMemTypeCPUOutput)) {}
void* Alloc(size_t size) override;

View file

@ -99,7 +99,7 @@ AllocatorPtr ROCMExecutionProvider::CreateRocmAllocator(OrtDevice::DeviceId devi
if (external_allocator_info.UseExternalAllocator()) {
AllocatorCreationInfo default_memory_info(
[external_allocator_info](OrtDevice::DeviceId id) {
return std::make_unique<ROCMExternalAllocator>(id, CUDA, external_allocator_info.alloc, external_allocator_info.free, external_allocator_info.empty_cache);
return std::make_unique<ROCMExternalAllocator>(id, HIP, external_allocator_info.alloc, external_allocator_info.free, external_allocator_info.empty_cache);
},
device_id,
false);
@ -109,7 +109,7 @@ AllocatorPtr ROCMExecutionProvider::CreateRocmAllocator(OrtDevice::DeviceId devi
} else {
AllocatorCreationInfo default_memory_info(
[](OrtDevice::DeviceId id) {
return std::make_unique<ROCMAllocator>(id, CUDA);
return std::make_unique<ROCMAllocator>(id, HIP);
},
device_id,
true,
@ -184,7 +184,7 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in
ROCMExecutionProvider::~ROCMExecutionProvider() {
// Prevent memory leak when people don't call
// OnRunStart and OnRunEnd when calling CudaKernel's.
// OnRunStart and OnRunEnd when calling HipKernel's.
ORT_IGNORE_RETURN_VALUE(EnqueueDeferredRelease());
// clean up thread local context caches
@ -1283,12 +1283,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual);
// Opset 17
// TODO: Enable LayerNormalization. It uses the same implementation as the old contrib op.
// TODO: Enable LayerNormalization. It uses the same implementation as the old contrib op.
// See https://github.com/microsoft/onnxruntime/pull/13066
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization);
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, double, LayerNormalization);
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization);
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization);
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, float, LayerNormalization);
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, double, LayerNormalization);
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization);
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization);
template <>
KernelCreateInfo BuildKernelCreateInfo<void>() {
@ -2213,10 +2213,10 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual)>,
// Opset 17
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, double, LayerNormalization)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, float, LayerNormalization)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, double, LayerNormalization)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization)>,
};
for (auto& function_table_entry : function_table) {
@ -2338,7 +2338,7 @@ ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
void ROCMExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) {
OrtDevice::DeviceId short_device_id = gsl::narrow<OrtDevice::DeviceId>(info_.device_id);
OrtDevice gpu_device{OrtDevice::GPU, OrtDevice::MemType::DEFAULT, short_device_id};
OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
OrtDevice cpu_device{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
// setup ROCM allocator
@ -2370,7 +2370,7 @@ void ROCMExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manage
if (!rocm_pinned_alloc) {
AllocatorCreationInfo pinned_memory_info(
[](OrtDevice::DeviceId device_id) {
return std::make_unique<ROCMPinnedAllocator>(device_id, CUDA_PINNED);
return std::make_unique<ROCMPinnedAllocator>(device_id, HIP_PINNED);
},
pinned_device.Id());
rocm_pinned_alloc = CreateAllocator(pinned_memory_info);

View file

@ -90,7 +90,7 @@ class ROCMExecutionProvider : public IExecutionProvider {
template <typename T>
IAllocatorUniquePtr<T> AllocateBufferOnCPUPinned(size_t count_or_bytes) const {
// Note that OrtMemTypeCPU and OrtMemTypeCPUOutput are the same. See onnxruntime_c_api.h.
// In some CUDA async
// In some ROCm async
if (count_or_bytes == 0)
return nullptr;
return IAllocator::MakeUniquePtr<T>(GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPUOutput),
@ -128,8 +128,8 @@ class ROCMExecutionProvider : public IExecutionProvider {
hipStream_t stream_ = nullptr;
// deferred_release_buffer_pool_[my_stream] store all CPU buffers associated with
// CUDA kernels running on my_stream (type: cudaStream_t).
// Buffers' release is enqueued as a CUDA callback onto the associated stream (aka
// HIP kernels running on my_stream (type: hipStream_t).
// Buffers' release is enqueued as a HIP callback onto the associated stream (aka
// stream returned by GetComputeStream when calling AddDeferredReleaseCPUPtr) in OnRunEnd.
// Those are pointers allocated by AllocateBufferOnCPUPinned and should be released
// by CPU Allocator's Free function.

View file

@ -44,7 +44,7 @@ struct ROCMExecutionProviderInfo {
void* user_compute_stream{nullptr};
// The following OrtArenaCfg instance only characterizes the behavior of the default memory
// arena allocator and not any other auxiliary allocator that may also be part of the ROCM EP.
// For example, auxiliary allocators `CUDA_PINNED` and `CUDA_CPU` will not be configured using this
// For example, auxiliary allocators `HIP_PINNED` and `HIP_CPU` will not be configured using this
// arena config.
OrtArenaCfg* default_memory_arena_cfg{nullptr};
ROCMExecutionProviderExternalAllocatorInfo external_allocator_info{};