Fix unsound hipify in ROCm EP (#13269)

Some cuda related things is still left in the rocm ep statically hipified code. Eliminate them to avoid confusion.
2026-06-27 03:11:28 +00:00 · 2022-10-12 08:32:42 +08:00 · 2022-10-12 08:32:42 +08:00 · 1e55949a70
commit 1e55949a70
parent b2353fa737
9 changed files with 31 additions and 29 deletions
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@ -40,6 +40,8 @@ constexpr const char* CUDA_PINNED = "CudaPinned";
 constexpr const char* CANN = "Cann";
 constexpr const char* CANN_PINNED = "CannPinned";
 constexpr const char* DML = "DML";
+constexpr const char* HIP = "Hip";
+constexpr const char* HIP_PINNED = "HipPinned";
 constexpr const char* OpenVINO_CPU = "OpenVINO_CPU";
 constexpr const char* OpenVINO_GPU = "OpenVINO_GPU";

@ -87,7 +89,7 @@ class IAllocator {
  }

  /**
-  * Calculate the memory size for an array. The size is bounds checked using SafeInt. 
+  * Calculate the memory size for an array. The size is bounds checked using SafeInt.
   * \tparam alignment must be power of 2
   * \param nmemb Number of members or elements in the array
   * \param size Size of each element
@ -103,7 +105,7 @@ class IAllocator {
   * \param size Size of each element
   * \param out Total size required after any alignment is applied
   * \return true, successful. false, overflow
-   * \remarks This was the original API and was implemented in the header. Replaced with the above version 
+   * \remarks This was the original API and was implemented in the header. Replaced with the above version
   *          implemented in the .cc file so that the SafeInt dependency is internal.
   */
  template <size_t alignment>
--- a/onnxruntime/contrib_ops/rocm/bert/attention_impl.h
+++ b/onnxruntime/contrib_ops/rocm/bert/attention_impl.h
@ -27,7 +27,7 @@ size_t GetAttentionWorkspaceSize(

 Status LaunchAttentionKernel(
    const hipDeviceProp_t& prop,               // Device Properties
-    hipStream_t stream,                        // cuda stream
+    hipStream_t stream,                        // Hip stream
    rocblas_handle& rocblas,                   // Rocblas handle
    const size_t element_size,                 // Element size of input tensor
    int batch_size,                            // Batch size (B)
@ -48,7 +48,7 @@ Status LaunchAttentionKernel(

 Status LaunchDecoderAttentionKernel(
    const hipDeviceProp_t& prop,      // Device Properties
-    hipStream_t stream,               // Cuda stream
+    hipStream_t stream,               // Hip stream
    rocblas_handle& rocblas,          // Rocblas handle
    const size_t element_size,        // Element size of input tensor
    const int batch_size,             // Batch size (B)
--- a/onnxruntime/core/providers/rocm/cu_inc/common.cuh
+++ b/onnxruntime/core/providers/rocm/cu_inc/common.cuh
@ -336,7 +336,7 @@ struct GridDim {
  };
 };

-// aligned vector generates vectorized load/store on CUDA
+// aligned vector generates vectorized load/store
 template<typename T, int vec_size>
 struct alignas(sizeof(T) * vec_size) aligned_vector {
  T val[vec_size];
--- a/onnxruntime/core/providers/rocm/gpu_data_transfer.cc
+++ b/onnxruntime/core/providers/rocm/gpu_data_transfer.cc
@ -33,8 +33,8 @@ GPUDataTransfer::~GPUDataTransfer() {
 }

 bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
-  return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED ||
-         dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
+  return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED ||
+         dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::HIP_PINNED;
 }

 common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const {
@ -46,7 +46,7 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e
  auto& dst_device = dst.Location().device;

  if (dst_device.Type() == OrtDevice::GPU) {
-    if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
+    if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
      // copy from pinned memory to GPU, this is non-blocking
      HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, GetStream(exec_queue_id)));
    } else if (src_device.Type() == OrtDevice::GPU) {
@ -61,7 +61,7 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e
      HIP_RETURN_IF_ERROR(hipStreamSynchronize(GetStream(kHipStreamDefault)));
    }
  } else if (src_device.Type() == OrtDevice::GPU) {
-    if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
+    if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
      // copying from GPU to pinned memory, this is non-blocking
      HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, GetStream(exec_queue_id)));
    } else {
--- a/onnxruntime/core/providers/rocm/math/softmax_warpwise_impl.cuh
+++ b/onnxruntime/core/providers/rocm/math/softmax_warpwise_impl.cuh
@ -164,5 +164,5 @@ __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batc
  }
 }

-}  // namespace cuda
+}  // namespace rocm
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/rocm/rocm_allocator.h
+++ b/onnxruntime/core/providers/rocm/rocm_allocator.h
@ -57,7 +57,7 @@ class ROCMPinnedAllocator : public IAllocator {
  ROCMPinnedAllocator(OrtDevice::DeviceId device_id, const char* name)
      : IAllocator(
            OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
-                          OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, device_id),
+                          OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, device_id),
                          device_id, OrtMemTypeCPUOutput)) {}

  void* Alloc(size_t size) override;
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@ -99,7 +99,7 @@ AllocatorPtr ROCMExecutionProvider::CreateRocmAllocator(OrtDevice::DeviceId devi
  if (external_allocator_info.UseExternalAllocator()) {
    AllocatorCreationInfo default_memory_info(
        [external_allocator_info](OrtDevice::DeviceId id) {
-          return std::make_unique<ROCMExternalAllocator>(id, CUDA, external_allocator_info.alloc, external_allocator_info.free, external_allocator_info.empty_cache);
+          return std::make_unique<ROCMExternalAllocator>(id, HIP, external_allocator_info.alloc, external_allocator_info.free, external_allocator_info.empty_cache);
        },
        device_id,
        false);
@ -109,7 +109,7 @@ AllocatorPtr ROCMExecutionProvider::CreateRocmAllocator(OrtDevice::DeviceId devi
  } else {
    AllocatorCreationInfo default_memory_info(
        [](OrtDevice::DeviceId id) {
-          return std::make_unique<ROCMAllocator>(id, CUDA);
+          return std::make_unique<ROCMAllocator>(id, HIP);
        },
        device_id,
        true,
@ -184,7 +184,7 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in

 ROCMExecutionProvider::~ROCMExecutionProvider() {
  // Prevent memory leak when people don't call
-  // OnRunStart and OnRunEnd when calling CudaKernel's.
+  // OnRunStart and OnRunEnd when calling HipKernel's.
  ORT_IGNORE_RETURN_VALUE(EnqueueDeferredRelease());

  // clean up thread local context caches
@ -1283,12 +1283,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual);

 // Opset 17
-// TODO: Enable LayerNormalization. It uses the same implementation as the old contrib op. 
+// TODO: Enable LayerNormalization. It uses the same implementation as the old contrib op.
 // See https://github.com/microsoft/onnxruntime/pull/13066
-// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization);
-// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, double, LayerNormalization);
-// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization);
-// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, float, LayerNormalization);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, double, LayerNormalization);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization);

 template <>
 KernelCreateInfo BuildKernelCreateInfo<void>() {
@ -2213,10 +2213,10 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) {
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual)>,

      // Opset 17
-      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, double, LayerNormalization)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization)>,
+      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, float, LayerNormalization)>,
+      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, double, LayerNormalization)>,
+      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, BFloat16, LayerNormalization)>,
+      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 17, MLFloat16, LayerNormalization)>,
  };

  for (auto& function_table_entry : function_table) {
@ -2338,7 +2338,7 @@ ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
 void ROCMExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) {
  OrtDevice::DeviceId short_device_id = gsl::narrow<OrtDevice::DeviceId>(info_.device_id);
  OrtDevice gpu_device{OrtDevice::GPU, OrtDevice::MemType::DEFAULT, short_device_id};
-  OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
+  OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
  OrtDevice cpu_device{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};

  // setup ROCM allocator
@ -2370,7 +2370,7 @@ void ROCMExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manage
    if (!rocm_pinned_alloc) {
      AllocatorCreationInfo pinned_memory_info(
          [](OrtDevice::DeviceId device_id) {
-            return std::make_unique<ROCMPinnedAllocator>(device_id, CUDA_PINNED);
+            return std::make_unique<ROCMPinnedAllocator>(device_id, HIP_PINNED);
          },
          pinned_device.Id());
      rocm_pinned_alloc = CreateAllocator(pinned_memory_info);
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h
@ -90,7 +90,7 @@ class ROCMExecutionProvider : public IExecutionProvider {
  template <typename T>
  IAllocatorUniquePtr<T> AllocateBufferOnCPUPinned(size_t count_or_bytes) const {
    // Note that OrtMemTypeCPU and OrtMemTypeCPUOutput are the same. See onnxruntime_c_api.h.
-    // In some CUDA async
+    // In some ROCm async
    if (count_or_bytes == 0)
      return nullptr;
    return IAllocator::MakeUniquePtr<T>(GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPUOutput),
@ -128,8 +128,8 @@ class ROCMExecutionProvider : public IExecutionProvider {
  hipStream_t stream_ = nullptr;

  // deferred_release_buffer_pool_[my_stream] store all CPU buffers associated with
-  // CUDA kernels running on my_stream (type: cudaStream_t).
-  // Buffers' release is enqueued as a CUDA callback onto the associated stream (aka
+  // HIP kernels running on my_stream (type: hipStream_t).
+  // Buffers' release is enqueued as a HIP callback onto the associated stream (aka
  // stream returned by GetComputeStream when calling AddDeferredReleaseCPUPtr) in OnRunEnd.
  // Those are pointers allocated by AllocateBufferOnCPUPinned and should be released
  // by CPU Allocator's Free function.
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h
@ -44,7 +44,7 @@ struct ROCMExecutionProviderInfo {
  void* user_compute_stream{nullptr};
  // The following OrtArenaCfg instance only characterizes the behavior of the default memory
  // arena allocator and not any other auxiliary allocator that may also be part of the ROCM EP.
-  // For example, auxiliary allocators `CUDA_PINNED` and `CUDA_CPU` will not be configured using this
+  // For example, auxiliary allocators `HIP_PINNED` and `HIP_CPU` will not be configured using this
  // arena config.
  OrtArenaCfg* default_memory_arena_cfg{nullptr};
  ROCMExecutionProviderExternalAllocatorInfo external_allocator_info{};