diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h index ca5e520b8c..17a5c80f71 100644 --- a/include/onnxruntime/core/framework/allocator.h +++ b/include/onnxruntime/core/framework/allocator.h @@ -40,6 +40,8 @@ constexpr const char* CUDA_PINNED = "CudaPinned"; constexpr const char* CANN = "Cann"; constexpr const char* CANN_PINNED = "CannPinned"; constexpr const char* DML = "DML"; +constexpr const char* HIP = "Hip"; +constexpr const char* HIP_PINNED = "HipPinned"; constexpr const char* OpenVINO_CPU = "OpenVINO_CPU"; constexpr const char* OpenVINO_GPU = "OpenVINO_GPU"; @@ -87,7 +89,7 @@ class IAllocator { } /** - * Calculate the memory size for an array. The size is bounds checked using SafeInt. + * Calculate the memory size for an array. The size is bounds checked using SafeInt. * \tparam alignment must be power of 2 * \param nmemb Number of members or elements in the array * \param size Size of each element @@ -103,7 +105,7 @@ class IAllocator { * \param size Size of each element * \param out Total size required after any alignment is applied * \return true, successful. false, overflow - * \remarks This was the original API and was implemented in the header. Replaced with the above version + * \remarks This was the original API and was implemented in the header. Replaced with the above version * implemented in the .cc file so that the SafeInt dependency is internal. */ template diff --git a/onnxruntime/contrib_ops/rocm/bert/attention_impl.h b/onnxruntime/contrib_ops/rocm/bert/attention_impl.h index 7b027679c6..ae21d02508 100644 --- a/onnxruntime/contrib_ops/rocm/bert/attention_impl.h +++ b/onnxruntime/contrib_ops/rocm/bert/attention_impl.h @@ -27,7 +27,7 @@ size_t GetAttentionWorkspaceSize( Status LaunchAttentionKernel( const hipDeviceProp_t& prop, // Device Properties - hipStream_t stream, // cuda stream + hipStream_t stream, // Hip stream rocblas_handle& rocblas, // Rocblas handle const size_t element_size, // Element size of input tensor int batch_size, // Batch size (B) @@ -48,7 +48,7 @@ Status LaunchAttentionKernel( Status LaunchDecoderAttentionKernel( const hipDeviceProp_t& prop, // Device Properties - hipStream_t stream, // Cuda stream + hipStream_t stream, // Hip stream rocblas_handle& rocblas, // Rocblas handle const size_t element_size, // Element size of input tensor const int batch_size, // Batch size (B) diff --git a/onnxruntime/core/providers/rocm/cu_inc/common.cuh b/onnxruntime/core/providers/rocm/cu_inc/common.cuh index ad11ad34d3..5c516aac65 100644 --- a/onnxruntime/core/providers/rocm/cu_inc/common.cuh +++ b/onnxruntime/core/providers/rocm/cu_inc/common.cuh @@ -336,7 +336,7 @@ struct GridDim { }; }; -// aligned vector generates vectorized load/store on CUDA +// aligned vector generates vectorized load/store template struct alignas(sizeof(T) * vec_size) aligned_vector { T val[vec_size]; diff --git a/onnxruntime/core/providers/rocm/gpu_data_transfer.cc b/onnxruntime/core/providers/rocm/gpu_data_transfer.cc index f0471aeeb6..dffe93e045 100644 --- a/onnxruntime/core/providers/rocm/gpu_data_transfer.cc +++ b/onnxruntime/core/providers/rocm/gpu_data_transfer.cc @@ -33,8 +33,8 @@ GPUDataTransfer::~GPUDataTransfer() { } bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { - return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED || - dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED; + return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED || + dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::HIP_PINNED; } common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const { @@ -46,7 +46,7 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e auto& dst_device = dst.Location().device; if (dst_device.Type() == OrtDevice::GPU) { - if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { + if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::HIP_PINNED) { // copy from pinned memory to GPU, this is non-blocking HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, GetStream(exec_queue_id))); } else if (src_device.Type() == OrtDevice::GPU) { @@ -61,7 +61,7 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e HIP_RETURN_IF_ERROR(hipStreamSynchronize(GetStream(kHipStreamDefault))); } } else if (src_device.Type() == OrtDevice::GPU) { - if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { + if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::HIP_PINNED) { // copying from GPU to pinned memory, this is non-blocking HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, GetStream(exec_queue_id))); } else { diff --git a/onnxruntime/core/providers/rocm/math/softmax_warpwise_impl.cuh b/onnxruntime/core/providers/rocm/math/softmax_warpwise_impl.cuh index 98f7483ff5..2cfddce972 100644 --- a/onnxruntime/core/providers/rocm/math/softmax_warpwise_impl.cuh +++ b/onnxruntime/core/providers/rocm/math/softmax_warpwise_impl.cuh @@ -164,5 +164,5 @@ __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batc } } -} // namespace cuda +} // namespace rocm } // namespace onnxruntime diff --git a/onnxruntime/core/providers/rocm/rocm_allocator.h b/onnxruntime/core/providers/rocm/rocm_allocator.h index dec10cfeac..461bb5017b 100644 --- a/onnxruntime/core/providers/rocm/rocm_allocator.h +++ b/onnxruntime/core/providers/rocm/rocm_allocator.h @@ -57,7 +57,7 @@ class ROCMPinnedAllocator : public IAllocator { ROCMPinnedAllocator(OrtDevice::DeviceId device_id, const char* name) : IAllocator( OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, device_id), + OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, device_id), device_id, OrtMemTypeCPUOutput)) {} void* Alloc(size_t size) override; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index 12eb2728a6..8af4ee8b17 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -99,7 +99,7 @@ AllocatorPtr ROCMExecutionProvider::CreateRocmAllocator(OrtDevice::DeviceId devi if (external_allocator_info.UseExternalAllocator()) { AllocatorCreationInfo default_memory_info( [external_allocator_info](OrtDevice::DeviceId id) { - return std::make_unique(id, CUDA, external_allocator_info.alloc, external_allocator_info.free, external_allocator_info.empty_cache); + return std::make_unique(id, HIP, external_allocator_info.alloc, external_allocator_info.free, external_allocator_info.empty_cache); }, device_id, false); @@ -109,7 +109,7 @@ AllocatorPtr ROCMExecutionProvider::CreateRocmAllocator(OrtDevice::DeviceId devi } else { AllocatorCreationInfo default_memory_info( [](OrtDevice::DeviceId id) { - return std::make_unique(id, CUDA); + return std::make_unique(id, HIP); }, device_id, true, @@ -184,7 +184,7 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in ROCMExecutionProvider::~ROCMExecutionProvider() { // Prevent memory leak when people don't call - // OnRunStart and OnRunEnd when calling CudaKernel's. + // OnRunStart and OnRunEnd when calling HipKernel's. ORT_IGNORE_RETURN_VALUE(EnqueueDeferredRelease()); // clean up thread local context caches @@ -1283,12 +1283,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual); // Opset 17 -// TODO: Enable LayerNormalization. It uses the same implementation as the old contrib op. +// TODO: Enable LayerNormalization. It uses the same implementation as the old contrib op. // See https://github.com/microsoft/onnxruntime/pull/13066 -// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization); -// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, double, LayerNormalization); -// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization); -// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization); +// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, float, LayerNormalization); +// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, double, LayerNormalization); +// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization); +// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization); template <> KernelCreateInfo BuildKernelCreateInfo() { @@ -2213,10 +2213,10 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, // Opset 17 - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { @@ -2338,7 +2338,7 @@ ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, void ROCMExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) { OrtDevice::DeviceId short_device_id = gsl::narrow(info_.device_id); OrtDevice gpu_device{OrtDevice::GPU, OrtDevice::MemType::DEFAULT, short_device_id}; - OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; + OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; OrtDevice cpu_device{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; // setup ROCM allocator @@ -2370,7 +2370,7 @@ void ROCMExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manage if (!rocm_pinned_alloc) { AllocatorCreationInfo pinned_memory_info( [](OrtDevice::DeviceId device_id) { - return std::make_unique(device_id, CUDA_PINNED); + return std::make_unique(device_id, HIP_PINNED); }, pinned_device.Id()); rocm_pinned_alloc = CreateAllocator(pinned_memory_info); diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.h b/onnxruntime/core/providers/rocm/rocm_execution_provider.h index ca8d3b363a..8a684fb4d0 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h @@ -90,7 +90,7 @@ class ROCMExecutionProvider : public IExecutionProvider { template IAllocatorUniquePtr AllocateBufferOnCPUPinned(size_t count_or_bytes) const { // Note that OrtMemTypeCPU and OrtMemTypeCPUOutput are the same. See onnxruntime_c_api.h. - // In some CUDA async + // In some ROCm async if (count_or_bytes == 0) return nullptr; return IAllocator::MakeUniquePtr(GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPUOutput), @@ -128,8 +128,8 @@ class ROCMExecutionProvider : public IExecutionProvider { hipStream_t stream_ = nullptr; // deferred_release_buffer_pool_[my_stream] store all CPU buffers associated with - // CUDA kernels running on my_stream (type: cudaStream_t). - // Buffers' release is enqueued as a CUDA callback onto the associated stream (aka + // HIP kernels running on my_stream (type: hipStream_t). + // Buffers' release is enqueued as a HIP callback onto the associated stream (aka // stream returned by GetComputeStream when calling AddDeferredReleaseCPUPtr) in OnRunEnd. // Those are pointers allocated by AllocateBufferOnCPUPinned and should be released // by CPU Allocator's Free function. diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h index aa88537991..c59ac92cc1 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h @@ -44,7 +44,7 @@ struct ROCMExecutionProviderInfo { void* user_compute_stream{nullptr}; // The following OrtArenaCfg instance only characterizes the behavior of the default memory // arena allocator and not any other auxiliary allocator that may also be part of the ROCM EP. - // For example, auxiliary allocators `CUDA_PINNED` and `CUDA_CPU` will not be configured using this + // For example, auxiliary allocators `HIP_PINNED` and `HIP_CPU` will not be configured using this // arena config. OrtArenaCfg* default_memory_arena_cfg{nullptr}; ROCMExecutionProviderExternalAllocatorInfo external_allocator_info{};