mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-27 03:11:28 +00:00
Fix unsound hipify in ROCm EP (#13269)
Some cuda related things is still left in the rocm ep statically hipified code. Eliminate them to avoid confusion.
This commit is contained in:
parent
b2353fa737
commit
1e55949a70
9 changed files with 31 additions and 29 deletions
|
|
@ -40,6 +40,8 @@ constexpr const char* CUDA_PINNED = "CudaPinned";
|
|||
constexpr const char* CANN = "Cann";
|
||||
constexpr const char* CANN_PINNED = "CannPinned";
|
||||
constexpr const char* DML = "DML";
|
||||
constexpr const char* HIP = "Hip";
|
||||
constexpr const char* HIP_PINNED = "HipPinned";
|
||||
constexpr const char* OpenVINO_CPU = "OpenVINO_CPU";
|
||||
constexpr const char* OpenVINO_GPU = "OpenVINO_GPU";
|
||||
|
||||
|
|
@ -87,7 +89,7 @@ class IAllocator {
|
|||
}
|
||||
|
||||
/**
|
||||
* Calculate the memory size for an array. The size is bounds checked using SafeInt.
|
||||
* Calculate the memory size for an array. The size is bounds checked using SafeInt.
|
||||
* \tparam alignment must be power of 2
|
||||
* \param nmemb Number of members or elements in the array
|
||||
* \param size Size of each element
|
||||
|
|
@ -103,7 +105,7 @@ class IAllocator {
|
|||
* \param size Size of each element
|
||||
* \param out Total size required after any alignment is applied
|
||||
* \return true, successful. false, overflow
|
||||
* \remarks This was the original API and was implemented in the header. Replaced with the above version
|
||||
* \remarks This was the original API and was implemented in the header. Replaced with the above version
|
||||
* implemented in the .cc file so that the SafeInt dependency is internal.
|
||||
*/
|
||||
template <size_t alignment>
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ size_t GetAttentionWorkspaceSize(
|
|||
|
||||
Status LaunchAttentionKernel(
|
||||
const hipDeviceProp_t& prop, // Device Properties
|
||||
hipStream_t stream, // cuda stream
|
||||
hipStream_t stream, // Hip stream
|
||||
rocblas_handle& rocblas, // Rocblas handle
|
||||
const size_t element_size, // Element size of input tensor
|
||||
int batch_size, // Batch size (B)
|
||||
|
|
@ -48,7 +48,7 @@ Status LaunchAttentionKernel(
|
|||
|
||||
Status LaunchDecoderAttentionKernel(
|
||||
const hipDeviceProp_t& prop, // Device Properties
|
||||
hipStream_t stream, // Cuda stream
|
||||
hipStream_t stream, // Hip stream
|
||||
rocblas_handle& rocblas, // Rocblas handle
|
||||
const size_t element_size, // Element size of input tensor
|
||||
const int batch_size, // Batch size (B)
|
||||
|
|
|
|||
|
|
@ -336,7 +336,7 @@ struct GridDim {
|
|||
};
|
||||
};
|
||||
|
||||
// aligned vector generates vectorized load/store on CUDA
|
||||
// aligned vector generates vectorized load/store
|
||||
template<typename T, int vec_size>
|
||||
struct alignas(sizeof(T) * vec_size) aligned_vector {
|
||||
T val[vec_size];
|
||||
|
|
|
|||
|
|
@ -33,8 +33,8 @@ GPUDataTransfer::~GPUDataTransfer() {
|
|||
}
|
||||
|
||||
bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
|
||||
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED ||
|
||||
dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
|
||||
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED ||
|
||||
dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::HIP_PINNED;
|
||||
}
|
||||
|
||||
common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const {
|
||||
|
|
@ -46,7 +46,7 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e
|
|||
auto& dst_device = dst.Location().device;
|
||||
|
||||
if (dst_device.Type() == OrtDevice::GPU) {
|
||||
if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
|
||||
if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
|
||||
// copy from pinned memory to GPU, this is non-blocking
|
||||
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, GetStream(exec_queue_id)));
|
||||
} else if (src_device.Type() == OrtDevice::GPU) {
|
||||
|
|
@ -61,7 +61,7 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e
|
|||
HIP_RETURN_IF_ERROR(hipStreamSynchronize(GetStream(kHipStreamDefault)));
|
||||
}
|
||||
} else if (src_device.Type() == OrtDevice::GPU) {
|
||||
if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
|
||||
if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
|
||||
// copying from GPU to pinned memory, this is non-blocking
|
||||
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, GetStream(exec_queue_id)));
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -164,5 +164,5 @@ __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batc
|
|||
}
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace rocm
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ class ROCMPinnedAllocator : public IAllocator {
|
|||
ROCMPinnedAllocator(OrtDevice::DeviceId device_id, const char* name)
|
||||
: IAllocator(
|
||||
OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
|
||||
OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, device_id),
|
||||
OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, device_id),
|
||||
device_id, OrtMemTypeCPUOutput)) {}
|
||||
|
||||
void* Alloc(size_t size) override;
|
||||
|
|
|
|||
|
|
@ -99,7 +99,7 @@ AllocatorPtr ROCMExecutionProvider::CreateRocmAllocator(OrtDevice::DeviceId devi
|
|||
if (external_allocator_info.UseExternalAllocator()) {
|
||||
AllocatorCreationInfo default_memory_info(
|
||||
[external_allocator_info](OrtDevice::DeviceId id) {
|
||||
return std::make_unique<ROCMExternalAllocator>(id, CUDA, external_allocator_info.alloc, external_allocator_info.free, external_allocator_info.empty_cache);
|
||||
return std::make_unique<ROCMExternalAllocator>(id, HIP, external_allocator_info.alloc, external_allocator_info.free, external_allocator_info.empty_cache);
|
||||
},
|
||||
device_id,
|
||||
false);
|
||||
|
|
@ -109,7 +109,7 @@ AllocatorPtr ROCMExecutionProvider::CreateRocmAllocator(OrtDevice::DeviceId devi
|
|||
} else {
|
||||
AllocatorCreationInfo default_memory_info(
|
||||
[](OrtDevice::DeviceId id) {
|
||||
return std::make_unique<ROCMAllocator>(id, CUDA);
|
||||
return std::make_unique<ROCMAllocator>(id, HIP);
|
||||
},
|
||||
device_id,
|
||||
true,
|
||||
|
|
@ -184,7 +184,7 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in
|
|||
|
||||
ROCMExecutionProvider::~ROCMExecutionProvider() {
|
||||
// Prevent memory leak when people don't call
|
||||
// OnRunStart and OnRunEnd when calling CudaKernel's.
|
||||
// OnRunStart and OnRunEnd when calling HipKernel's.
|
||||
ORT_IGNORE_RETURN_VALUE(EnqueueDeferredRelease());
|
||||
|
||||
// clean up thread local context caches
|
||||
|
|
@ -1283,12 +1283,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual);
|
||||
|
||||
// Opset 17
|
||||
// TODO: Enable LayerNormalization. It uses the same implementation as the old contrib op.
|
||||
// TODO: Enable LayerNormalization. It uses the same implementation as the old contrib op.
|
||||
// See https://github.com/microsoft/onnxruntime/pull/13066
|
||||
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization);
|
||||
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, double, LayerNormalization);
|
||||
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization);
|
||||
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization);
|
||||
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, float, LayerNormalization);
|
||||
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, double, LayerNormalization);
|
||||
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization);
|
||||
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization);
|
||||
|
||||
template <>
|
||||
KernelCreateInfo BuildKernelCreateInfo<void>() {
|
||||
|
|
@ -2213,10 +2213,10 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual)>,
|
||||
|
||||
// Opset 17
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, double, LayerNormalization)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, float, LayerNormalization)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, double, LayerNormalization)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization)>,
|
||||
};
|
||||
|
||||
for (auto& function_table_entry : function_table) {
|
||||
|
|
@ -2338,7 +2338,7 @@ ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
|
|||
void ROCMExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) {
|
||||
OrtDevice::DeviceId short_device_id = gsl::narrow<OrtDevice::DeviceId>(info_.device_id);
|
||||
OrtDevice gpu_device{OrtDevice::GPU, OrtDevice::MemType::DEFAULT, short_device_id};
|
||||
OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
|
||||
OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
|
||||
OrtDevice cpu_device{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
|
||||
|
||||
// setup ROCM allocator
|
||||
|
|
@ -2370,7 +2370,7 @@ void ROCMExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manage
|
|||
if (!rocm_pinned_alloc) {
|
||||
AllocatorCreationInfo pinned_memory_info(
|
||||
[](OrtDevice::DeviceId device_id) {
|
||||
return std::make_unique<ROCMPinnedAllocator>(device_id, CUDA_PINNED);
|
||||
return std::make_unique<ROCMPinnedAllocator>(device_id, HIP_PINNED);
|
||||
},
|
||||
pinned_device.Id());
|
||||
rocm_pinned_alloc = CreateAllocator(pinned_memory_info);
|
||||
|
|
|
|||
|
|
@ -90,7 +90,7 @@ class ROCMExecutionProvider : public IExecutionProvider {
|
|||
template <typename T>
|
||||
IAllocatorUniquePtr<T> AllocateBufferOnCPUPinned(size_t count_or_bytes) const {
|
||||
// Note that OrtMemTypeCPU and OrtMemTypeCPUOutput are the same. See onnxruntime_c_api.h.
|
||||
// In some CUDA async
|
||||
// In some ROCm async
|
||||
if (count_or_bytes == 0)
|
||||
return nullptr;
|
||||
return IAllocator::MakeUniquePtr<T>(GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPUOutput),
|
||||
|
|
@ -128,8 +128,8 @@ class ROCMExecutionProvider : public IExecutionProvider {
|
|||
hipStream_t stream_ = nullptr;
|
||||
|
||||
// deferred_release_buffer_pool_[my_stream] store all CPU buffers associated with
|
||||
// CUDA kernels running on my_stream (type: cudaStream_t).
|
||||
// Buffers' release is enqueued as a CUDA callback onto the associated stream (aka
|
||||
// HIP kernels running on my_stream (type: hipStream_t).
|
||||
// Buffers' release is enqueued as a HIP callback onto the associated stream (aka
|
||||
// stream returned by GetComputeStream when calling AddDeferredReleaseCPUPtr) in OnRunEnd.
|
||||
// Those are pointers allocated by AllocateBufferOnCPUPinned and should be released
|
||||
// by CPU Allocator's Free function.
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ struct ROCMExecutionProviderInfo {
|
|||
void* user_compute_stream{nullptr};
|
||||
// The following OrtArenaCfg instance only characterizes the behavior of the default memory
|
||||
// arena allocator and not any other auxiliary allocator that may also be part of the ROCM EP.
|
||||
// For example, auxiliary allocators `CUDA_PINNED` and `CUDA_CPU` will not be configured using this
|
||||
// For example, auxiliary allocators `HIP_PINNED` and `HIP_CPU` will not be configured using this
|
||||
// arena config.
|
||||
OrtArenaCfg* default_memory_arena_cfg{nullptr};
|
||||
ROCMExecutionProviderExternalAllocatorInfo external_allocator_info{};
|
||||
|
|
|
|||
Loading…
Reference in a new issue