CUDA EP vs ROCM EP hipify audit (#17776)

Migrate most CUDA EP improvements and changes to ROCM EP. The process involves using hipify against all CUDA EP files (i.e. do not exclude any files from onnxruntime_rocm_hipify.cmake) then vimdiff compare them against the ROCM EP files that are under source control and pull in most changes. These changes include functional as well as formatting and makes comparing CUDA EP and ROCM EP easier, though it makes the PR diff somewhat less obvious due to formatting changes. - hipify audit of onnxruntime/core/providers/rocm, enable ops - Loop - Scan - hipify audit of onnxruntime/contrib_ops/rocm - fix contrib ops search implementation - enable more contrib ops - Affine - ComplexMul - ConvTransposeWithDynamicPads - Crop - DynamicSlice - FFT [Rfft, Irfft] - GreedySearch - ImageScaler - ParametricSoftplus - ScaledTanh - ThresholdRelu --------- Co-authored-by: cloudhan <cloudhan@outlook.com>
2026-05-18 21:21:17 +00:00 · 2023-10-12 19:13:53 -07:00 · 2023-10-12 19:13:53 -07:00 · 07317316cc
commit 07317316cc
parent ba7f20ac57
39 changed files with 1493 additions and 1460 deletions
--- a/cmake/onnxruntime_providers_migraphx.cmake
+++ b/cmake/onnxruntime_providers_migraphx.cmake
@ -42,7 +42,7 @@
  onnxruntime_add_include_to_target(onnxruntime_providers_migraphx onnxruntime_common onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface)
  add_dependencies(onnxruntime_providers_migraphx onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
  target_link_libraries(onnxruntime_providers_migraphx PRIVATE ${migraphx_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface)
-  target_include_directories(onnxruntime_providers_migraphx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR})
+  target_include_directories(onnxruntime_providers_migraphx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime)
  set_target_properties(onnxruntime_providers_migraphx PROPERTIES LINKER_LANGUAGE CXX)
  set_target_properties(onnxruntime_providers_migraphx PROPERTIES FOLDER "ONNXRuntime")
  target_compile_definitions(onnxruntime_providers_migraphx PRIVATE ONNXIFI_BUILD_LIBRARY=1)
@ -72,4 +72,4 @@
          ARCHIVE  DESTINATION ${CMAKE_INSTALL_LIBDIR}
          LIBRARY  DESTINATION ${CMAKE_INSTALL_LIBDIR}
          RUNTIME  DESTINATION ${CMAKE_INSTALL_BINDIR}
-  )
+  )
--- a/cmake/onnxruntime_providers_rocm.cmake
+++ b/cmake/onnxruntime_providers_rocm.cmake
@ -10,6 +10,7 @@
  find_package(hiprand REQUIRED)
  find_package(rocblas REQUIRED)
  find_package(MIOpen REQUIRED)
+  find_package(hipfft REQUIRED)

  # MIOpen version
  if(NOT DEFINED ENV{MIOPEN_PATH})
@ -48,7 +49,7 @@

  find_library(RCCL_LIB rccl REQUIRED)
  find_library(ROCTRACER_LIB roctracer64 REQUIRED)
-  set(ONNXRUNTIME_ROCM_LIBS roc::rocblas MIOpen ${RCCL_LIB} ${ROCTRACER_LIB})
+  set(ONNXRUNTIME_ROCM_LIBS roc::rocblas MIOpen hip::hipfft ${RCCL_LIB} ${ROCTRACER_LIB})

  file(GLOB_RECURSE onnxruntime_providers_rocm_cc_srcs CONFIGURE_DEPENDS
    "${ONNXRUNTIME_ROOT}/core/providers/rocm/*.h"
@ -219,4 +220,4 @@
  install(TARGETS onnxruntime_providers_rocm
          ARCHIVE  DESTINATION ${CMAKE_INSTALL_LIBDIR}
          LIBRARY  DESTINATION ${CMAKE_INSTALL_LIBDIR}
-          RUNTIME  DESTINATION ${CMAKE_INSTALL_BINDIR})
+          RUNTIME  DESTINATION ${CMAKE_INSTALL_BINDIR})
--- a/cmake/onnxruntime_rocm_hipify.cmake
+++ b/cmake/onnxruntime_rocm_hipify.cmake
@ -48,15 +48,6 @@ set(contrib_ops_excluded_files
  "diffusion/group_norm_impl.cu"
  "diffusion/group_norm_impl.h"
  "diffusion/nhwc_conv.cc"
-  "math/complex_mul.cc"
-  "math/complex_mul.h"
-  "math/complex_mul_impl.cu"
-  "math/complex_mul_impl.h"
-  "math/cufft_plan_cache.h"
-  "math/fft_ops.cc"
-  "math/fft_ops.h"
-  "math/fft_ops_impl.cu"
-  "math/fft_ops_impl.h"
  "quantization/attention_quantization.cc"
  "quantization/attention_quantization.h"
  "quantization/attention_quantization_impl.cu"
@ -86,19 +77,6 @@ set(contrib_ops_excluded_files
  "quantization/qordered_ops/qordered_unary_ops.cc"
  "quantization/qordered_ops/qordered_unary_ops_impl.h"
  "quantization/qordered_ops/qordered_unary_ops_impl.cu"
-  "tensor/crop.cc"
-  "tensor/crop.h"
-  "tensor/crop_impl.cu"
-  "tensor/crop_impl.h"
-  "tensor/dynamicslice.cc"
-  "tensor/image_scaler.cc"
-  "tensor/image_scaler.h"
-  "tensor/image_scaler_impl.cu"
-  "tensor/image_scaler_impl.h"
-  "transformers/greedy_search.cc"
-  "transformers/greedy_search.h"
-  "conv_transpose_with_dynamic_pads.cc"
-  "conv_transpose_with_dynamic_pads.h"
  "cuda_contrib_kernels.cc"
  "cuda_contrib_kernels.h"
  "inverse.cc"
@ -119,10 +97,6 @@ endif()

 set(provider_excluded_files
  "atomic/common.cuh"
-  "controlflow/loop.cc"
-  "controlflow/loop.h"
-  "controlflow/scan.cc"
-  "controlflow/scan.h"
  "cu_inc/common.cuh"
  "math/einsum_utils/einsum_auxiliary_ops.cc"
  "math/einsum_utils/einsum_auxiliary_ops.h"
@ -170,7 +144,6 @@ set(provider_excluded_files
  "cuda_memory_check.h"
  "cuda_fence.cc"
  "cuda_fence.h"
-  "cuda_fwd.h"
  "cuda_kernel.h"
  "cuda_pch.cc"
  "cuda_pch.h"
--- a/onnxruntime/contrib_ops/cuda/transformers/greedy_search.cc
+++ b/onnxruntime/contrib_ops/cuda/transformers/greedy_search.cc
@ -48,10 +48,12 @@ GreedySearch::GreedySearch(const OpKernelInfo& info)

  SetConsoleDumper(&g_cuda_dumper_greedysearch);

+#ifndef USE_ROCM
  cuda_device_prop_ = &reinterpret_cast<const CUDAExecutionProvider*>(info.GetExecutionProvider())->GetDeviceProp();

  cuda_device_arch_ = static_cast<const cudaDeviceProp*>(cuda_device_prop_)->major * 100 +
                      static_cast<const cudaDeviceProp*>(cuda_device_prop_)->minor * 10;
+#endif
 }

 Status GreedySearch::ComputeInternal(OpKernelContext* context) const {
--- a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc
@ -29,6 +29,14 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, FusedMatMul);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, FusedMatMul);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, FusedMatMul);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, RelativePositionBias);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, RelativePositionBias);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, GatedRelativePositionBias);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, GatedRelativePositionBias);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, RemovePadding);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, RemovePadding);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, RestorePadding);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, RestorePadding);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, Rfft);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, Rfft);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, Rfft);
@ -52,6 +60,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, Affine);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, Attention);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, Attention);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, PackedAttention);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, PackedAttention);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, PackedMultiHeadAttention);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, PackedMultiHeadAttention);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BeamSearch);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ConvTransposeWithDynamicPads);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, Crop);
@ -61,12 +73,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, MultiHeadAttention);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, DecoderAttention);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, DecoderAttention);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, DecoderMaskedMultiHeadAttention);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, DecoderMaskedMultiHeadAttention);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int32_t, DynamicSlice);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int64_t, DynamicSlice);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, EmbedLayerNormalization);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, GreedySearch);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, GroupNorm);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, NhwcConv);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, NhwcConv);
@ -113,6 +124,17 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FastGelu);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, TransposeMatMul);  // backward compatibility
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FusedMatMul);
+// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QOrderedMatMul);
+// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QOrderedLayerNormalization);
+// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QOrderedGelu);
+// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QuantizeWithOrder);
+// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, DequantizeWithOrder);
+// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QOrderedAttention);
+// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QOrderedLongformerAttention);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, DecoderMaskedSelfAttention);
+// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, DecoderMaskedSelfAttention);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, DecoderMaskedMultiHeadAttention);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, DecoderMaskedMultiHeadAttention);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, GemmFastGelu);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, GemmFastGelu);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, GemmFastGelu);
@ -139,6 +161,7 @@ KernelCreateInfo BuildKernelCreateInfo<void>() {
  return info;
 }

+// clang-format off
 Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
  static const BuildKernelCreateInfoFn function_table[] = {
    BuildKernelCreateInfo<void>,  // default entry to avoid the list become empty after ops-reducing
@ -162,70 +185,73 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, FusedMatMul)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, FusedMatMul)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, FusedMatMul)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, Rfft)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, Rfft)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, Rfft)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, Irfft)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, Irfft)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, Irfft)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ComplexMul)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, ComplexMul)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ComplexMulConj)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, ComplexMulConj)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain,
-                                                          1, NGramRepeatBlock)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, RelativePositionBias)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, RelativePositionBias)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, GatedRelativePositionBias)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, GatedRelativePositionBias)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, RemovePadding)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, RemovePadding)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, RestorePadding)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, RestorePadding)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, Rfft)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, Rfft)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, Rfft)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, Irfft)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, Irfft)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, Irfft)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ComplexMul)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, ComplexMul)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ComplexMulConj)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, ComplexMulConj)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, NGramRepeatBlock)>,

    // These ops were experimental ops in onnx domain which have been removed now. We add them here as
    // contrib ops to maintain backward compatibility
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, Affine)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, Affine)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, Affine)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, Affine)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, Affine)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, Affine)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, Attention)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, Attention)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, PackedAttention)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, PackedAttention)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, PackedMultiHeadAttention)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, PackedMultiHeadAttention)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BeamSearch)>,
-
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ConvTransposeWithDynamicPads)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, Crop)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, Crop)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, Crop)>,
-
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ConvTransposeWithDynamicPads)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, Crop)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, Crop)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, Crop)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, MultiHeadAttention)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, MultiHeadAttention)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain,
-                                                                1, float, DecoderAttention)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain,
-                                                                1, MLFloat16, DecoderAttention)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, DecoderMaskedMultiHeadAttention)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, DecoderMaskedMultiHeadAttention)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int32_t, DynamicSlice)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int64_t, DynamicSlice)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, DecoderAttention)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, DecoderAttention)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int32_t, DynamicSlice)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int64_t, DynamicSlice)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, EmbedLayerNormalization)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, GreedySearch)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, GroupNorm)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, NhwcConv)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, NhwcConv)>,
-
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ImageScaler)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ImageScaler)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ImageScaler)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain,
-                                                                1, float, LongformerAttention)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain,
-                                                                1, MLFloat16, LongformerAttention)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ParametricSoftplus)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ParametricSoftplus)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ImageScaler)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ImageScaler)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ImageScaler)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, LongformerAttention)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, LongformerAttention)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ParametricSoftplus)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ParametricSoftplus)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Sampling)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, SkipLayerNormalization)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, SkipLayerNormalization)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, SkipSimplifiedLayerNormalization)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, SkipSimplifiedLayerNormalization)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ThresholdedRelu)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ThresholdedRelu)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ThresholdedRelu)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ThresholdedRelu)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ThresholdedRelu)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ThresholdedRelu)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 16, float_float_float, LayerNormalization)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 16, double_double_double, LayerNormalization)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 16, MLFloat16_float_MLFloat16, LayerNormalization)>,
@ -238,7 +264,6 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float_float_MLFloat16, SimplifiedLayerNormalization)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16_float_float, SimplifiedLayerNormalization)>,
    // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Inverse)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Trilu)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BiasSoftmax)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BiasDropout)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BitmaskDropout)>,
@ -249,16 +274,25 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, int8_t_MLFloat16, DequantizeLinear)>,
    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, uint8_t_MLFloat16, DequantizeLinear)>,
    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float_int8_t, QAttention)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16_int8_t, QAttention)>
-
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16_int8_t, QAttention)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Trilu)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FastGelu)>,
    // TransposedMatMul is still here for backward compatibility
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, TransposeMatMul)>,  // backward compatibility
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FusedMatMul)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain,
-                                                                1, float, FusedConv)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain,
-                                                                1, MLFloat16, FusedConv)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, FusedConv)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, FusedConv)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QOrderedMatMul)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QOrderedLayerNormalization)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QOrderedGelu)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QuantizeWithOrder)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, DequantizeWithOrder)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QOrderedAttention)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, QOrderedLongformerAttention)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, DecoderMaskedSelfAttention)>,
+    // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, DecoderMaskedSelfAttention)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, DecoderMaskedMultiHeadAttention)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, DecoderMaskedMultiHeadAttention)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, GemmFastGelu)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, GemmFastGelu)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, GemmFastGelu)>,
@ -278,6 +312,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllGather)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllToAll)>,
 #endif
+
  };

  for (auto& function_table_entry : function_table) {
@ -289,6 +324,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {

  return Status::OK();
 }
+// clang-format on

 }  // namespace rocm
 }  // namespace contrib
--- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
+++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
@ -243,7 +243,7 @@ struct CUDA_Provider : Provider {
    cuda_options.arena_extend_strategy = internal_options.arena_extend_strategy;
    cuda_options.do_copy_in_default_stream = internal_options.do_copy_in_default_stream;
    cuda_options.has_user_compute_stream = internal_options.has_user_compute_stream;
-    // The 'has_user_compute_stream' of the OrtCUDAProviderOptionsV2 instance can be set byC API UpdateCUDAProviderOptionsWithValue() as well.
+    // The 'has_user_compute_stream' of the OrtCUDAProviderOptionsV2 instance can be set by C API UpdateCUDAProviderOptionsWithValue() as well.
    // We only set the 'has_user_compute_stream' of the OrtCUDAProviderOptionsV2 instance if it is provided in options
    if (options.find("has_user_compute_stream") != options.end()) {
      cuda_options.user_compute_stream = internal_options.user_compute_stream;
--- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
@ -102,8 +102,9 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
      }
      s_.y_dims = gsl::make_span(y_dims);

-      if (w_dims_changed)
+      if (w_dims_changed) {
        ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, CudnnTensor::GetDataType<CudaT>()));
+      }

      // Special case when there is a dim value of 0 in the shape.
      // Return only after we have cached the following for subsequent runs :
--- a/onnxruntime/core/providers/rocm/cu_inc/common.cuh
+++ b/onnxruntime/core/providers/rocm/cu_inc/common.cuh
@ -2,8 +2,6 @@
 // Licensed under the MIT License.

 #pragma once
-#include <type_traits>
-#include <memory>
 #include <stdint.h>
 #include <vector>
 #include <mutex>
@ -294,6 +292,14 @@ __device__ __inline__ T _Gelu(T a) {
  return a * _Normcdf(a);
 }

+template <>
+__device__ __inline__ half _Gelu(half a) {
+  const half kHalf = half(0.5);
+  const half kOne = half(1.0);
+  const half kAlpha = half(M_SQRT1_2);
+  return a * kHalf * (kOne + _Erf(kAlpha * a));
+}
+
 template <typename T>
 __device__ __inline__ T _Mod(T a, T b) {
  T r = a % b;
@ -348,21 +354,19 @@ struct GridDim {
  };
 };

-// aligned vector generates vectorized load/store
+// aligned vector generates vectorized load/store on ROCM
 template <typename T, int vec_size>
 struct alignas(sizeof(T) * vec_size) aligned_vector {
  T val[vec_size];
 };

-#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N)     \
+#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N)      \
  HIP_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \
-  if (id >= N)                                         \
+  if (id >= N)                                          \
    return;

 // HIP_KERNEL_ASSERT is a macro that wraps an assert() call inside rocm kernels.
-// TODO ROCM added support recently, should verify.
-#define HIP_KERNEL_ASSERT(...)
-// #define HIP_KERNEL_ASSERT(...) assert(__VA_ARGS__)
+#define HIP_KERNEL_ASSERT(...) assert(__VA_ARGS__)

 // WARP related definitions and functions
 constexpr int GPU_WARP_SIZE = warpSize;
--- a/onnxruntime/core/providers/rocm/fpgeneric.cu
+++ b/onnxruntime/core/providers/rocm/fpgeneric.cu
@ -68,7 +68,7 @@ rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocbla
 rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, const half* x, int incx, half* y, int incy) {
  dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
  dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
-  CopyVectorHalf<<<dim3(dimGrid), dim3(dimBlock), 0, stream>>>(x, incx, y, incy, n);
+  CopyVectorHalf<<<dimGrid, dimBlock, 0, stream>>>(x, incx, y, incy, n);
  return rocblas_status_success;
 }

@ -76,6 +76,6 @@ rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, cons
                                onnxruntime::BFloat16* y, int incy) {
  dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
  dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
-  CopyVectorBFloat16<<<dim3(dimGrid), dim3(dimBlock), 0, stream>>>(x, incx, y, incy, n);
+  CopyVectorBFloat16<<<dimGrid, dimBlock, 0, stream>>>(x, incx, y, incy, n);
  return rocblas_status_success;
 }
--- a/onnxruntime/core/providers/rocm/gpu_data_transfer.cc
+++ b/onnxruntime/core/providers/rocm/gpu_data_transfer.cc
@ -2,14 +2,15 @@
 // Licensed under the MIT License.

 #include "core/providers/shared_library/provider_api.h"
-#include "core/providers/rocm/rocm_common.h"
-#include "core/providers/rocm/gpu_data_transfer.h"

-// use default stream for copy for now, to avoid racing in BFC arena as in issue #4829
-// note this may cause some models to run slower if there are ops running on CPU
-// so we leave it as optional, in case user need the previous behavior
-// a full fix to BFC arena is being looked at, and once it's in, we can revert this change
+#include "core/providers/rocm/gpu_data_transfer.h"
+#include "core/providers/rocm/rocm_common.h"
+
 namespace onnxruntime {
+GPUDataTransfer::GPUDataTransfer() {}
+
+GPUDataTransfer::~GPUDataTransfer() {}
+
 bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
  return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED ||
         dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::HIP_PINNED;
@ -34,12 +35,12 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const
    } else {
      // copy from other CPU memory to GPU, this is blocking
      HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
-      HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));  // TODO: still need stream sync? since already blocking
+      HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
    }
  } else if (src_device.Type() == OrtDevice::GPU) {
    // copying from GPU to CPU memory, this is blocking
    HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
-    HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));  // TODO: still need stream sync? since already blocking
+    HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
  } else {
    // copying between cpu memory
    memcpy(dst_data, src_data, bytes);
@ -57,34 +58,29 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,
  auto& dst_device = dst.Location().device;

  if (dst_device.Type() == OrtDevice::GPU) {
-    if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
+    if (src_device.Type() == OrtDevice::CPU) {
      // copy from pinned memory to GPU, this is non-blocking
      HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
    } else if (src_device.Type() == OrtDevice::GPU) {
      // copying between GPU, this is non-blocking
-      // Copy only if the two addresses are different.
      if (dst_data != src_data) {
        HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToDevice, static_cast<hipStream_t>(stream.GetHandle())));
      }
-    } else {
-      // copy from other CPU memory to GPU, this is blocking
-      HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
-      HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast<hipStream_t>(stream.GetHandle())));
    }
  } else if (src_device.Type() == OrtDevice::GPU) {
-    if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
+    if (dst_device.Type() == OrtDevice::CPU) {
      // copying from GPU to pinned memory, this is non-blocking
      HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
-    } else {
-      // copying from GPU to CPU memory, this is blocking
-      HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
-      HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast<hipStream_t>(stream.GetHandle())));
    }
  } else {
-    // copying between cpu memory
+    if (src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
+      // sync the stream first to make sure the data arrived
+      HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast<hipStream_t>(stream.GetHandle())));
+    }
    memcpy(dst_data, src_data, bytes);
  }

  return Status::OK();
 }
+
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/rocm/gpu_data_transfer.h
+++ b/onnxruntime/core/providers/rocm/gpu_data_transfer.h
@ -10,8 +10,8 @@ namespace onnxruntime {

 class GPUDataTransfer : public IDataTransfer {
 public:
-  GPUDataTransfer() = default;
-  ~GPUDataTransfer() = default;
+  GPUDataTransfer();
+  ~GPUDataTransfer();

  bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;

--- a/onnxruntime/core/providers/rocm/integer_gemm.cc
+++ b/onnxruntime/core/providers/rocm/integer_gemm.cc
@ -5,13 +5,14 @@
 #include <rocblas/rocblas.h>
 #include "core/providers/rocm/shared_inc/integer_gemm.h"

+#include "core/common/safeint.h"
 #include "core/providers/rocm/rocm_common.h"
 #include "core/providers/rocm/shared_inc/rocm_call.h"

 namespace onnxruntime {
 namespace rocm {

-inline int roundoff(int v, int d) {
+constexpr int roundoff(int v, int d) {
  return (v + d - 1) / d * d;
 }

@ -21,20 +22,21 @@ Status GemmInt8(int m, int n, int k,
                const RocmKernel* rocm_kernel, onnxruntime::Stream* ort_stream) {
  ORT_ENFORCE(a != nullptr && b != nullptr && c != nullptr, "input matrix should not be null");
  ORT_ENFORCE(rocm_kernel != nullptr, "kernel is null");
+  ORT_ENFORCE(ort_stream != nullptr, "Rocm kernel must have the stream instance");

-  hipStream_t stream = ort_stream ? static_cast<hipStream_t>(ort_stream->GetHandle()) : nullptr;
+  hipStream_t stream = static_cast<hipStream_t>(ort_stream->GetHandle());

  // pad A and B to make their leading dimension be multiples of 32
-  // because cublasGemmEx requires:
+  // because rocblas_gemm_ex requires:
  // 1. leading dimension is multiples of 4
  // 2. A, B is 32-bit aligned

-  const int mask = 0x1F;
+  constexpr int mask = 0x1F;
  int lda_aligned = lda;
  IAllocatorUniquePtr<int8_t> a_padded;
  if ((mask & lda_aligned) != 0) {
    lda_aligned = roundoff(lda, 32);
-    a_padded = rocm_kernel->GetScratchBuffer<int8_t>(m * lda_aligned, ort_stream);
+    a_padded = rocm_kernel->GetScratchBuffer<int8_t>(SafeInt<size_t>(m) * lda_aligned, ort_stream);
    HIP_RETURN_IF_ERROR(hipMemcpy2DAsync(a_padded.get(), lda_aligned, a, lda, k, m, hipMemcpyDeviceToDevice, stream));
  }

@ -42,14 +44,15 @@ Status GemmInt8(int m, int n, int k,
  IAllocatorUniquePtr<int8_t> b_padded;
  if ((mask & ldb_aligned) != 0) {
    ldb_aligned = roundoff(ldb, 32);
-    b_padded = rocm_kernel->GetScratchBuffer<int8_t>(k * ldb_aligned, ort_stream);
+    b_padded = rocm_kernel->GetScratchBuffer<int8_t>(SafeInt<size_t>(k) * ldb_aligned, ort_stream);
    HIP_RETURN_IF_ERROR(hipMemcpy2DAsync(b_padded.get(), ldb_aligned, b, ldb, n, k, hipMemcpyDeviceToDevice, stream));
  }

-  RocmStream* ort_rocm_stream = static_cast<RocmStream*>(ort_stream);
-  auto handle = ort_rocm_stream->rocblas_handle_;
+  auto* ort_rocm_stream = dynamic_cast<RocmStream*>(ort_stream);
+  auto rocblas = ort_rocm_stream->rocblas_handle_;
+
  ROCBLAS_RETURN_IF_ERROR(rocblas_gemm_ex(
-      handle,
+      rocblas,
      rocblas_operation_none, rocblas_operation_none,
      n, m, k,
      &alpha,
--- a/onnxruntime/core/providers/rocm/math/einsum.h
+++ b/onnxruntime/core/providers/rocm/math/einsum.h
@ -17,8 +17,7 @@ class Einsum final : public onnxruntime::Einsum {
  Einsum(const OpKernelInfo& info) : onnxruntime::Einsum(info) {
    // We need to cast away the const as PerThreadRocblasHandle() is currently a non-const method
    // TODO: Clean up the ROCMExecutionProvider interface to avoid this
-    rocm_ep_ = const_cast<ROCMExecutionProvider*>(
-        static_cast<const ROCMExecutionProvider*>(info.GetExecutionProvider()));
+    rocm_ep_ = static_cast<const ROCMExecutionProvider*>(info.GetExecutionProvider());
  }

  Status Compute(OpKernelContext* context) const override;
@ -32,7 +31,7 @@ class Einsum final : public onnxruntime::Einsum {
  using onnxruntime::Einsum::equation_;

  // We need to access to the ROCM EP instance to get the rocblas/miopen handles
-  ROCMExecutionProvider* rocm_ep_;
+  const ROCMExecutionProvider* rocm_ep_;
 };

 }  // namespace rocm
--- a/onnxruntime/core/providers/rocm/math/einsum_utils/einsum_auxiliary_ops.h
+++ b/onnxruntime/core/providers/rocm/math/einsum_utils/einsum_auxiliary_ops.h
@ -21,19 +21,18 @@ namespace EinsumOp {
 // Holds ROCM assets required for ROCM ops that need to be executed as part of the Einsum flow
 struct EinsumRocmAssets {
  explicit EinsumRocmAssets(rocblas_handle rocblas_handle,
-                            ROCMExecutionProvider* rocm_ep,
-                            Stream* ort_stream,
-                            AllocatorPtr gpu_allocator) : rocblas_handle_(rocblas_handle),
-                                                          rocm_ep_(rocm_ep),
-                                                          ort_stream_(ort_stream),
-                                                          gpu_allocator_(gpu_allocator) {}
+                            const ROCMExecutionProvider* rocm_ep,
+                            Stream* ort_stream, AllocatorPtr gpu_allocator) : rocblas_handle_(rocblas_handle),
+                                                                              rocm_ep_(rocm_ep),
+                                                                              ort_stream_(ort_stream),
+                                                                              gpu_allocator_(gpu_allocator) {}

  hipStream_t GetRocmStream() {
    return ort_stream_ ? static_cast<hipStream_t>(ort_stream_->GetHandle()) : nullptr;
  }

  rocblas_handle rocblas_handle_;
-  ROCMExecutionProvider* rocm_ep_;
+  const ROCMExecutionProvider* rocm_ep_;
  Stream* ort_stream_;
  AllocatorPtr gpu_allocator_;
 };
--- a/onnxruntime/core/providers/rocm/math/softmax.cc
+++ b/onnxruntime/core/providers/rocm/math/softmax.cc
@ -29,20 +29,23 @@ Status SoftMaxComputeHelper(
  auto X_data = reinterpret_cast<const HipT_IN*>(X);

  if (D <= 1024 && D * sizeof(T) <= 4096) {
-    return dispatch_warpwise_softmax_forward<HipT_IN, HipT_OUT, AccumulationType_t<HipT_ACCUM>, IsLogSoftmax>(
-        stream, Y_data, X_data, gsl::narrow_cast<int>(D),
-        gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N), tuning_ctx);
+    return dispatch_warpwise_softmax_forward<
+        HipT_IN, HipT_OUT, AccumulationType_t<HipT_ACCUM>, IsLogSoftmax>(
+        stream, Y_data, X_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N), tuning_ctx);
  }
+
  return dispatch_blockwise_softmax_forward<HipT_IN, HipT_OUT, AccumulationType_t<HipT_ACCUM>, IsLogSoftmax>(
-      stream, Y_data, X_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D),
-      gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N), tuning_ctx);
+      stream, Y_data, X_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D),
+      gsl::narrow_cast<int>(N), tuning_ctx);
 }

-#define SPECIALIZED_SOFTMAX_HELPER_IMPL(T, TOut)                                                                           \
-  template Status SoftMaxComputeHelper<T, TOut, false>(Stream * stream, const T* input, const TensorShape& shape, TOut* Y, \
-                                                       int64_t axis, RocmTuningContext* tuning_ctx);                       \
-  template Status SoftMaxComputeHelper<T, TOut, true>(Stream * stream, const T* input, const TensorShape& shape, TOut* Y,  \
-                                                      int64_t axis, RocmTuningContext* tuning_ctx);
+#define SPECIALIZED_SOFTMAX_HELPER_IMPL(T, TOut)                                                        \
+  template Status SoftMaxComputeHelper<T, TOut, false>(Stream * stream, const T* input,                 \
+                                                       const TensorShape& shape, TOut* Y, int64_t axis, \
+                                                       RocmTuningContext* tuning_ctx);                  \
+  template Status SoftMaxComputeHelper<T, TOut, true>(Stream * stream, const T* input,                  \
+                                                      const TensorShape& shape, TOut* Y, int64_t axis,  \
+                                                      RocmTuningContext* tuning_ctx);

 SPECIALIZED_SOFTMAX_HELPER_IMPL(MLFloat16, float)
 SPECIALIZED_SOFTMAX_HELPER_IMPL(float, float)
--- a/onnxruntime/core/providers/rocm/nn/conv.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
@ -44,14 +44,13 @@ const miopenConvFwdAlgorithm_t Conv<T, NHWC>::kAllAlgos[] = {
    miopenConvolutionFwdAlgoWinograd,
    miopenConvolutionFwdAlgoImplicitGEMM};

-miopenStatus_t GetWorkspaceSize(miopenHandle_t handle, const MiopenConvState<miopenConvAlgoPerf_t>& s,
-                                miopenConvFwdAlgorithm_t algo, size_t* sz) {
+miopenStatus_t GetWorkspaceSize(miopenHandle_t handle, const MiopenConvState<miopenConvAlgoPerf_t>& s, miopenConvFwdAlgorithm_t algo, size_t* sz) {
  return miopenConvolutionForwardGetWorkSpaceSize(handle, s.w_desc, s.x_tensor, s.conv_desc, s.y_tensor, sz);
 }

 size_t GetMaxWorkspaceSize(miopenHandle_t handle, const MiopenConvState<miopenConvAlgoPerf_t>& s,
                           const miopenConvFwdAlgorithm_t* algo, int n_algo) {
-  // TODO: get maximum available size from memory arean
+  // TODO: get maximum available size from memory arena
  size_t free, total;
  HIP_CALL_THROW(hipMemGetInfo(&free, &total));
  // Assuming 10% of fragmentation
@ -68,8 +67,7 @@ size_t GetMaxWorkspaceSize(miopenHandle_t handle, const MiopenConvState<miopenCo
 }

 Status SliceOutUnwantedOutputSection(hipStream_t stream,
-                                     const void* input_data,
-                                     const gsl::span<const int64_t>& input_dims,
+                                     const void* input_data, gsl::span<const int64_t> input_dims,
                                     void* output_data,
                                     const gsl::span<const int64_t>& output_dims,
                                     const gsl::span<const int64_t>& starts,
@ -103,8 +101,7 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
  // Make sure input and weight are 4D for NHWC since we set 4D descriptor for NHWC.
  constexpr bool channels_last = NHWC;
  if (channels_last && (x_shape.NumDimensions() != 4 || w_shape.NumDimensions() != 4)) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "Number of dimensions of X and W should be 4 for channels_last format (NHWC)");
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Number of dimensions of X and W should be 4 for channels_last format (NHWC)");
  }

  // set B
@ -140,7 +137,7 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)

    const size_t kernel_rank = kernel_shape.size();

-    ConvAttributes::ConvPadVector pads(conv_attrs_.pads);
+    ConvPadVector pads(conv_attrs_.pads);
    if (pads.empty()) {
      pads.resize(kernel_rank * 2, 0);
    }
@ -174,7 +171,7 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
    TensorShapeVector slice_axes;
    slice_axes.reserve(kernel_rank);

-    const size_t spatial_dim_start = channels_last ? 1 : 2;
+    constexpr size_t spatial_dim_start = channels_last ? 1 : 2;
    const size_t spatial_dim_end = spatial_dim_start + kernel_rank;
    TensorShape spatial_shape = X->Shape().Slice(spatial_dim_start, spatial_dim_end);

@ -183,7 +180,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
                                                                     strides, dilations, pads, y_dims, y_dims_with_adjusted_pads,
                                                                     post_slicing_required, slice_starts, slice_ends, slice_axes,
                                                                     channels_last));
-
    if (channels_last) {
      y_dims.push_back(M);
      y_dims_with_adjusted_pads.push_back(M);
@ -198,9 +194,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
    s_.slice_axes = slice_axes;

    s_.Y = context->Output(0, TensorShape(s_.y_dims));
-    if (s_.Y->Shape().Size() == 0) {
-      return Status::OK();
-    }
    if (post_slicing_required) {
      // Post slicing needed. Create and fill in the Conv results in an intermediate buffer.
      s_.memory_for_miopen_conv_results = GetScratchBuffer<void>(TensorShape(y_dims_with_adjusted_pads).Size() * s_.element_size, context->GetComputeStream());
@ -225,18 +218,23 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
    }

    if (w_dims_changed) {
-      if (channels_last) {
+      if (!channels_last) {
+        ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, MiopenTensor::GetDataType<HipT>()));
+      } else {
        ORT_RETURN_IF_ERROR(s_.w_desc.Set(MiopenTensor::GetDataType<HipT>(),
                                          miopenTensorNHWC,
                                          w_dims[0],
                                          w_dims[3],
                                          w_dims[1],
                                          w_dims[2]));
-      } else {
-        ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, MiopenTensor::GetDataType<HipT>()));
      }
    }

+    // We must delay returning early until here so that the weight dims have been cached properly
+    if (s_.Y->Shape().Size() == 0) {
+      return Status::OK();
+    }
+
    if (channels_last) {
      ORT_RETURN_IF_ERROR(s_.x_tensor.Set(MiopenTensor::GetDataType<HipT>(),
                                          miopenTensorNHWC,
@ -357,7 +355,7 @@ Status Conv<T, NHWC>::ComputeInternal(OpKernelContext* context) const {
  // To deal with asymmetric padding, we may have over-padded on one or both sides of the spatial dimensions
  // This may have lead to extra results that are unnecessary and hence we slice that off here
  if (s_.post_slicing_required) {
-    ORT_RETURN_IF_ERROR(SliceOutUnwantedOutputSection(Stream(context), s_.y_data, s_.y_dims_with_adjusted_pads,
+    ORT_RETURN_IF_ERROR(SliceOutUnwantedOutputSection(Stream(context), s_.y_data, gsl::make_span(s_.y_dims_with_adjusted_pads),
                                                      s_.Y->MutableDataRaw(), s_.y_dims.GetDims(), s_.slice_starts,
                                                      s_.slice_ends, s_.slice_axes, s_.element_size));
  }
@ -384,18 +382,18 @@ MiopenConvolutionDescriptor::~MiopenConvolutionDescriptor() {

 Status MiopenConvolutionDescriptor::Set(
    size_t rank,
-    gsl::span<const int64_t> pads,
-    gsl::span<const int64_t> strides,
-    gsl::span<const int64_t> dilations,
+    const gsl::span<const int64_t>& pads,
+    const gsl::span<const int64_t>& strides,
+    const gsl::span<const int64_t>& dilations,
    int groups,
    miopenConvolutionMode_t mode,
    miopenDataType_t data_type) {
  if (!desc_)
    MIOPEN_RETURN_IF_ERROR(miopenCreateConvolutionDescriptor(&desc_));

-  InlinedVector<int> pad_dims(rank);
-  InlinedVector<int> stride_dims(rank);
-  InlinedVector<int> dilation_dims(rank);
+  InlinedVector<int, kTensorShapeSmallBufferElementsSize> pad_dims(rank);
+  InlinedVector<int, kTensorShapeSmallBufferElementsSize> stride_dims(rank);
+  InlinedVector<int, kTensorShapeSmallBufferElementsSize> dilation_dims(rank);
  for (size_t i = 0; i < rank; i++) {
    pad_dims[i] = gsl::narrow_cast<int>(pads[i]);
    stride_dims[i] = gsl::narrow_cast<int>(strides[i]);
--- a/onnxruntime/core/providers/rocm/nn/conv.h
+++ b/onnxruntime/core/providers/rocm/nn/conv.h
@ -10,6 +10,9 @@
 #include <list>

 namespace onnxruntime {
+
+using ConvPadVector = ConvAttributes::ConvPadVector;
+
 namespace rocm {

 class MiopenConvolutionDescriptor final {
@ -18,9 +21,9 @@ class MiopenConvolutionDescriptor final {
  ~MiopenConvolutionDescriptor();

  Status Set(size_t rank,
-             gsl::span<const int64_t> pads,
-             gsl::span<const int64_t> strides,
-             gsl::span<const int64_t> dilations,
+             const gsl::span<const int64_t>& pads,
+             const gsl::span<const int64_t>& strides,
+             const gsl::span<const int64_t>& dilations,
             int groups,
             miopenConvolutionMode_t mode,
             miopenDataType_t data_type);
@ -198,7 +201,7 @@ class Conv : public RocmKernel {

 Status SliceOutUnwantedOutputSection(hipStream_t stream,
                                     const void* input_data,
-                                     const gsl::span<const int64_t>& input_dims,
+                                     gsl::span<const int64_t> input_dims,
                                     void* output_data,
                                     const gsl::span<const int64_t>& output_dims,
                                     const gsl::span<const int64_t>& starts,
--- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
@ -8,6 +8,9 @@
 #include "core/providers/rocm/math/binary_elementwise_ops_impl.h"
 #include "core/providers/rocm/math/binary_elementwise_ops.h"
 #include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
+#ifdef ENABLE_TRAINING
+#include "contrib_ops/cpu/aten_ops/aten_op.h"
+#endif

 using namespace onnxruntime::common;
 namespace onnxruntime {
@ -100,8 +103,8 @@ namespace rocm {
      (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
      name<T>);

-// ROCM ArgMax/ArgMin doesn't have OpSet12 implementation (with select_last_index attr), keep it in OpSet11 for now.
-#define REGISTER_KERNEL_TYPED_11(name, T)                                                  \
+// ROCM ArgMax/ArgMin doesn't have OpSet12+ implementation (with select_last_index attr) yet
+#define REGISTER_KERNEL_VERSIONED_TYPED_11(name, T)                                        \
  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                                 \
      name,                                                                                \
      kOnnxDomain,                                                                         \
@ -110,10 +113,10 @@ namespace rocm {
      kRocmExecutionProvider,                                                              \
      (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
      name<T>);                                                                            \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                                           \
+  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                                 \
      name,                                                                                \
      kOnnxDomain,                                                                         \
-      11,                                                                                  \
+      11, 11,                                                                              \
      T,                                                                                   \
      kRocmExecutionProvider,                                                              \
      (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
@ -166,7 +169,6 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
  const auto rank = input_shape.NumDimensions();

  auto hip_stream = stream ? static_cast<hipStream_t>(stream->GetHandle()) : nullptr;
-
  // Block of fast matrix reduction.
  if (fast_reduction_) {
    int m{}, n{};
@ -210,10 +212,8 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
    ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType<float>(), ReduceTensorIndices));
  else
    ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
-
  const auto one = ReduceConsts<HipT>::One;
  const auto zero = ReduceConsts<HipT>::Zero;
-
  MiopenTensor input_tensor;
  MiopenTensor output_tensor;
  ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));
@ -444,17 +444,18 @@ template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices>
 Status ReduceComputeCore(const AllocatorPtr& gpu_allocator, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
                         /*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op,
                         gsl::span<const int64_t> axes,
-                         bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, Stream* ort_stream,
+                         bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
+                         Stream* ort_stream,
                         const TensorShape* input_shape_override) {
  typedef typename ToHipType<T>::MappedType HipT;
  const TensorShape& input_shape = input_shape_override ? *input_shape_override : input.Shape();
+  hipStream_t stream = ort_stream ? static_cast<hipStream_t>(ort_stream->GetHandle()) : nullptr;

  int64_t input_count = prepare_reduce_metadata.input_count;
  int64_t output_count = prepare_reduce_metadata.output_count;
  auto& output_dims = prepare_reduce_metadata.output_dims;
  auto& input_dims_miopen = prepare_reduce_metadata.input_dims_miopen;
  auto& output_dims_miopen = prepare_reduce_metadata.output_dims_miopen;
-  hipStream_t stream = ort_stream ? static_cast<hipStream_t>(ort_stream->GetHandle()) : nullptr;
  // special case when there is a dim value of 0 in the shape.
  if (input_count == 0) {
    assert(output.Shape().Size() == 0);
@ -540,7 +541,6 @@ Status ReduceComputeCore(const AllocatorPtr& gpu_allocator, const Tensor& input,

  const auto one = ReduceConsts<HipT>::One;
  const auto zero = ReduceConsts<HipT>::Zero;
-
  MiopenTensor input_tensor;
  MiopenTensor output_tensor;
  ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));
@ -588,11 +588,12 @@ Status ReduceComputeCore(const AllocatorPtr& gpu_allocator, const Tensor& input,
        MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(RocmKernel::GetMiopenHandle(rocm_stream), reduce_max_desc,
                                                             input_tensor, output_tensor, &indices_bytes_max));
        auto indices_rocm_max = indices_bytes == 0 ? nullptr : IAllocator::MakeUniquePtr<uint32_t>(gpu_allocator, indices_bytes, false, ort_stream, WaitRocmNotificationOnDevice);
+        auto* p_output = reinterpret_cast<HipT*>(output.template MutableData<T>());
        MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(
            RocmKernel::GetMiopenHandle(rocm_stream), reduce_max_desc, indices_rocm_max.get(), indices_bytes_max,
            workspace_rocm.get(), workspace_bytes,
            &one, input_tensor, reinterpret_cast<const HipT*>(input.Data<T>()),
-            &zero, output_tensor, reinterpret_cast<HipT*>(output.MutableData<T>())));
+            &zero, output_tensor, p_output));
      }

      // Exp(X-ReduceMax)
@ -652,11 +653,12 @@ Status ReduceComputeCore(const AllocatorPtr& gpu_allocator, const Tensor& input,
      if (input_count == output_count) {
        HIP_RETURN_IF_ERROR(hipMemcpyAsync(reinterpret_cast<HipT*>(output.MutableData<T>()), input_data, input_count * sizeof(T), hipMemcpyDeviceToDevice, stream));
      } else {
+        auto* p_output = reinterpret_cast<HipT*>(output.template MutableData<T>());
        MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(
            RocmKernel::GetMiopenHandle(rocm_stream), reduce_desc, indices_rocm.get(), indices_bytes,
            workspace_rocm.get(), workspace_bytes,
            &one, input_tensor, input_data,
-            &zero, output_tensor, reinterpret_cast<HipT*>(output.MutableData<T>())));
+            &zero, output_tensor, p_output));
      }
    } else {
      // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case
@ -675,11 +677,12 @@ Status ReduceComputeCore(const AllocatorPtr& gpu_allocator, const Tensor& input,

          Impl_Cast<float, HipT>(stream, temp_output.get(), reinterpret_cast<HipT*>(output.MutableData<T>()), output_count);
        } else {
+          auto* p_output = reinterpret_cast<HipT*>(output.template MutableData<T>());
          MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(
              RocmKernel::GetMiopenHandle(rocm_stream), reduce_desc, indices_rocm.get(), indices_bytes,
              workspace_rocm.get(), workspace_bytes,
              &one, input_tensor, reinterpret_cast<const HipT*>(input.Data<T>()),
-              &zero, output_tensor, reinterpret_cast<HipT*>(output.MutableData<T>())));
+              &zero, output_tensor, p_output));
        }
      }
    }
@ -743,18 +746,29 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, miopenR
  // empty axes and no-op
  if (axes.empty() && noop_with_empty_axes_) {
    auto* Y = ctx->Output(0, X->Shape());
-    HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->MutableData<T>(), X->Data<T>(), X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream(ctx)));
+    HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->MutableData<T>(), X->Data<T>(), X->SizeInBytes(),
+                                       hipMemcpyDeviceToDevice, Stream(ctx)));
    return Status::OK();
  }

+#ifdef ENABLE_TRAINING
+  // Use ATen for ReduceSum if possible.
+  const TensorShape& input_shape = X->Shape();
+  if (contrib::IsATenOperatorExecutorInitialized() && miopen_reduce_op == MIOPEN_REDUCE_TENSOR_ADD && !calculate_log_ &&
+      !calculate_sqt_ && !log_sum_exp_ && input_shape.Size() > 0) {
+    if (axes.empty()) {
+      axes.resize(input_shape.NumDimensions());
+      std::iota(axes.begin(), axes.end(), 0);
+    }
+    ORT_RETURN_IF_ERROR(contrib::ExecuteReduceSumATen(ctx, axes, keepdims_));
+    return Status::OK();
+  }
+#endif
+
  PrepareReduceMetadata prepare_reduce_metadata;
-  ORT_RETURN_IF_ERROR(PrepareForReduce(X,
-                                       keepdims_,
-                                       axes,
-                                       prepare_reduce_metadata));
+  ORT_RETURN_IF_ERROR(PrepareForReduce(X, keepdims_, axes, prepare_reduce_metadata));
  Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
  const bool fast_reduction = fast_reduction_ && !ctx->GetUseDeterministicCompute();
-
  return ReduceComputeCore<T, ReduceTensorIndices>(Info().GetAllocator(OrtMemType::OrtMemTypeDefault), *X, prepare_reduce_metadata, *Y, miopen_reduce_op, axes,
                                                   calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction, ctx->GetComputeStream());
 }
@ -837,7 +851,6 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, miopenR
    MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(GetMiopenHandle(ctx), reduce_desc, indices_rocm.get(), indices_bytes,         \
                                              workspace_rocm.get(), workspace_bytes, &one, input_tensor, temp_X.get(),      \
                                              &zero, output_tensor, temp_Y.get()));                                         \
-                                                                                                                            \
    Impl_Cast<float, HipT>(Stream(ctx), temp_Y.get(), reinterpret_cast<HipT*>(Y->MutableData<T>()), output_count);          \
                                                                                                                            \
    return Status::OK();                                                                                                    \
@ -909,13 +922,13 @@ template std::unique_ptr<Tensor> ReduceCompute<MLFloat16, MIOPEN_REDUCE_TENSOR_N
  REGISTER_KERNEL_TYPED(name, BFloat16)
 // REGISTER_KERNEL_TYPED(name, double)

-#define REGISTER_KERNEL_HFD_11(name)        \
-  REGISTER_KERNEL_TYPED_11(name, MLFloat16) \
-  REGISTER_KERNEL_TYPED_11(name, float)
-// REGISTER_KERNEL_TYPED_11(name, double)
+#define REGISTER_KERNEL_HFD_VERSIONED_11(name)        \
+  REGISTER_KERNEL_VERSIONED_TYPED_11(name, MLFloat16) \
+  REGISTER_KERNEL_VERSIONED_TYPED_11(name, float)
+// REGISTER_KERNEL_VERSIONED_TYPED_11(name, double)

-REGISTER_KERNEL_HFD_11(ArgMax)
-REGISTER_KERNEL_HFD_11(ArgMin)
+REGISTER_KERNEL_HFD_VERSIONED_11(ArgMax)
+REGISTER_KERNEL_HFD_VERSIONED_11(ArgMin)
 REGISTER_KERNEL_HFD(ReduceL1)
 REGISTER_KERNEL_HFD(ReduceL2)

--- a/onnxruntime/core/providers/rocm/rocm_allocator.cc
+++ b/onnxruntime/core/providers/rocm/rocm_allocator.cc
@ -51,9 +51,8 @@ void* ROCMAllocator::Alloc(size_t size) {

 void ROCMAllocator::Free(void* p) {
  SetDevice(false);
-  CheckDevice(false);  // ignore ROCM failure when free
-  // do not throw error since it's OK for hipFree to fail during shutdown; void to silence nodiscard
-  (void)hipFree(p);
+  CheckDevice(false);                   // ignore ROCM failure when free
+  ORT_IGNORE_RETURN_VALUE(hipFree(p));  // do not throw error since it's OK for hipFree to fail during shutdown
 }

 void* ROCMExternalAllocator::Alloc(size_t size) {
--- a/onnxruntime/core/providers/rocm/rocm_allocator.h
+++ b/onnxruntime/core/providers/rocm/rocm_allocator.h
@ -3,7 +3,6 @@

 #pragma once

-#include <unordered_set>
 #include "core/common/inlined_containers.h"
 #include "core/framework/allocator.h"
 #include "core/platform/ort_mutex.h"
@ -56,7 +55,7 @@ class ROCMPinnedAllocator : public IAllocator {
  ROCMPinnedAllocator(const char* name)
      : IAllocator(
            OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
-                          OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, 0),
+                          OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, 0 /*CPU device always with id 0*/),
                          0, OrtMemTypeCPUOutput)) {}

  void* Alloc(size_t size) override;
--- a/onnxruntime/core/providers/rocm/rocm_call.cc
+++ b/onnxruntime/core/providers/rocm/rocm_call.cc
@ -39,11 +39,11 @@ const char* RocmErrString<rocblas_status>(rocblas_status e) {
    CASE_ENUM_TO_STR(rocblas_status_invalid_handle);
    CASE_ENUM_TO_STR(rocblas_status_not_implemented);
    CASE_ENUM_TO_STR(rocblas_status_invalid_pointer);
+    CASE_ENUM_TO_STR(rocblas_status_size_query_mismatch);
    CASE_ENUM_TO_STR(rocblas_status_invalid_size);
    CASE_ENUM_TO_STR(rocblas_status_memory_error);
    CASE_ENUM_TO_STR(rocblas_status_internal_error);
    CASE_ENUM_TO_STR(rocblas_status_perf_degraded);
-    CASE_ENUM_TO_STR(rocblas_status_size_query_mismatch);
    CASE_ENUM_TO_STR(rocblas_status_size_increased);
    CASE_ENUM_TO_STR(rocblas_status_size_unchanged);
    CASE_ENUM_TO_STR(rocblas_status_invalid_value);
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h
@ -36,11 +36,11 @@ class ROCMExecutionProvider : public IExecutionProvider {
    return nullptr;
  }

-  rocblas_handle PerThreadRocblasHandle() {
+  rocblas_handle PerThreadDefaultRocblasHandle() {
    return GetPerThreadContext().RocblasHandle();
  }

-  miopenHandle_t PerThreadMiopenHandle() {
+  miopenHandle_t PerThreadDefaultMiopenHandle() {
    return GetPerThreadContext().MiopenHandle();
  }

@ -60,7 +60,6 @@ class ROCMExecutionProvider : public IExecutionProvider {
  const hipDeviceProp_t& GetDeviceProp() const { return device_prop_; };
  int GetMiopenConvExhaustiveSearch() const { return info_.miopen_conv_exhaustive_search; }
  bool DoCopyOnDefaultStream() const { return info_.do_copy_in_default_stream; }
-
  bool GetMiopenConvUseMaxWorkspace() const { return info_.miopen_conv_use_max_workspace; }

  ProviderOptions GetProviderOptions() const override {
@ -68,15 +67,15 @@ class ROCMExecutionProvider : public IExecutionProvider {
  }

  static AllocatorPtr CreateRocmAllocator(OrtDevice::DeviceId device_id, size_t rocm_mem_limit, ArenaExtendStrategy arena_extend_strategy,
-                                          ROCMExecutionProviderExternalAllocatorInfo external_alloc_info, OrtArenaCfg* arena_cfg);
+                                          ROCMExecutionProviderExternalAllocatorInfo external_alloc_info, const OrtArenaCfg* arena_cfg);

  ITuningContext* GetTuningContext() const override;

  std::unique_ptr<profiling::EpProfiler> GetProfiler() override;

  void RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_handle_registry, AllocatorMap& allocators) const override;
-  std::vector<AllocatorPtr> CreatePreferredAllocators() override;
  OrtDevice GetOrtDeviceByMemType(OrtMemType mem_type) const override;
+  std::vector<AllocatorPtr> CreatePreferredAllocators() override;

 private:
  ROCMExecutionProviderInfo info_;
@ -105,21 +104,30 @@ class ROCMExecutionProvider : public IExecutionProvider {

    template <typename T>
    const T* GetConstOnes(size_t count, hipStream_t stream) {
-      if (std::is_same<T, float>::value) {
+      constexpr bool is_float = std::is_same<T, float>::value;
+      constexpr bool is_double = std::is_same<T, double>::value;
+      constexpr bool is_half = std::is_same<T, half>::value;
+      constexpr bool is_BFloat16 = std::is_same<T, BFloat16>::value;
+      if (is_float) {
        if (!constant_ones_float_) {
          constant_ones_float_ = rocm::CreateConstantOnes<float>();
        }
        return reinterpret_cast<const T*>(constant_ones_float_->GetBuffer(stream, count));
-      } else if (std::is_same<T, double>::value) {
+      } else if (is_double) {
        if (!constant_ones_double_) {
          constant_ones_double_ = rocm::CreateConstantOnes<double>();
        }
        return reinterpret_cast<const T*>(constant_ones_double_->GetBuffer(stream, count));
-      } else if (std::is_same<T, half>::value) {
+      } else if (is_half) {
        if (!constant_ones_half_) {
          constant_ones_half_ = rocm::CreateConstantOnes<half>();
        }
        return reinterpret_cast<const T*>(constant_ones_half_->GetBuffer(stream, count));
+      } else if (is_BFloat16) {
+        if (!constant_ones_bfloat16_) {
+          constant_ones_bfloat16_ = rocm::CreateConstantOnes<BFloat16>();
+        }
+        return reinterpret_cast<const T*>(constant_ones_bfloat16_->GetBuffer(stream, count));
      } else {
        return nullptr;
      }
@ -132,6 +140,7 @@ class ROCMExecutionProvider : public IExecutionProvider {
    std::unique_ptr<rocm::IConstantBuffer<float>> constant_ones_float_;
    std::unique_ptr<rocm::IConstantBuffer<double>> constant_ones_double_;
    std::unique_ptr<rocm::IConstantBuffer<half>> constant_ones_half_;
+    std::unique_ptr<rocm::IConstantBuffer<BFloat16>> constant_ones_bfloat16_;
  };

  using PerThreadContextMap = std::unordered_map<const ROCMExecutionProvider*, std::weak_ptr<PerThreadContext>>;
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc
@ -27,12 +27,10 @@ constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_dur
 }  // namespace provider_option_names
 }  // namespace rocm

-namespace {
 const EnumNameMapping<ArenaExtendStrategy> arena_extend_strategy_mapping{
    {ArenaExtendStrategy::kNextPowerOfTwo, "kNextPowerOfTwo"},
    {ArenaExtendStrategy::kSameAsRequested, "kSameAsRequested"},
 };
-}  // namespace

 ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) {
  ROCMExecutionProviderInfo info{};
@ -81,7 +79,9 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P
          .AddAssignmentToEnumReference(
              rocm::provider_option_names::kArenaExtendStrategy,
              arena_extend_strategy_mapping, info.arena_extend_strategy)
-          .AddAssignmentToReference(rocm::provider_option_names::kMiopenConvExhaustiveSearch, info.miopen_conv_exhaustive_search)
+          .AddAssignmentToReference(
+              rocm::provider_option_names::kMiopenConvExhaustiveSearch,
+              info.miopen_conv_exhaustive_search)
          .AddAssignmentToReference(rocm::provider_option_names::kDoCopyInDefaultStream, info.do_copy_in_default_stream)
          .AddAssignmentToReference(rocm::provider_option_names::kMiopenConvUseMaxWorkspace, info.miopen_conv_use_max_workspace)
          .AddValueParser(
--- a/onnxruntime/core/providers/rocm/rocm_fwd.h
+++ b/onnxruntime/core/providers/rocm/rocm_fwd.h
@ -1,13 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-#include "core/framework/op_kernel.h"
-
-namespace onnxruntime {
-namespace rocm {
-template <typename T>
-KernelCreateInfo BuildKernelCreateInfo();
-}
-}  // namespace onnxruntime
--- a/onnxruntime/core/providers/rocm/rocm_kernel.h
+++ b/onnxruntime/core/providers/rocm/rocm_kernel.h
@ -35,14 +35,12 @@ class RocmKernel : public OpKernel {
    // use this to precisely locate the node where ROCM failure comes from
    //  if (hipSuccess != hipDeviceSynchronize())
    //    __debugbreak();
-
    if (s.IsOK()) {
      auto err = hipGetLastError();
      if (err != hipSuccess) {
-        s = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "HIP error ", hipGetErrorName(err), ":", hipGetErrorString(err));
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "HIP error ", hipGetErrorName(err), ":", hipGetErrorString(err));
      }
    }
-
    return s;
  }

@ -64,18 +62,18 @@ class RocmKernel : public OpKernel {
    return IAllocator::MakeUniquePtr<T>(Info().GetAllocator(OrtMemType::OrtMemTypeDefault), count_or_bytes, true);
  }

-  template <typename T>
-  inline IAllocatorUniquePtr<T> AllocateBufferOnCPUPinned(size_t count_or_bytes) const {
-    if (count_or_bytes == 0) return nullptr;
-    return IAllocator::MakeUniquePtr<T>(Info().GetAllocator(OrtMemType::OrtMemTypeCPU), count_or_bytes);
-  }
-
  inline void AddDeferredReleaseCPUPtr(void* p, onnxruntime::Stream* ort_stream) const {
    ORT_ENFORCE(ort_stream->GetDevice().Type() == OrtDevice::GPU);
    auto* rocm_ep_stream = static_cast<RocmStream*>(ort_stream);
    rocm_ep_stream->EnqueDeferredCPUBuffer(p);
  }

+  template <typename T>
+  inline IAllocatorUniquePtr<T> AllocateBufferOnCPUPinned(size_t count_or_bytes) const {
+    if (count_or_bytes == 0) return nullptr;
+    return IAllocator::MakeUniquePtr<T>(Info().GetAllocator(OrtMemType::OrtMemTypeCPU), count_or_bytes);
+  }
+
  const hipDeviceProp_t& GetDeviceProp() const { return provider_->GetDeviceProp(); }

  inline hipStream_t Stream(OpKernelContext* ctx) const {
@ -83,6 +81,22 @@ class RocmKernel : public OpKernel {
    return stream ? static_cast<hipStream_t>(stream->GetHandle()) : nullptr;
  }

+  inline miopenHandle_t GetMiopenHandle(OpKernelContext* ctx) const {
+    return GetMiopenHandle(static_cast<RocmStream*>(ctx->GetComputeStream()));
+  }
+
+  static inline miopenHandle_t GetMiopenHandle(onnxruntime::RocmStream* stream) {
+    return stream->miopen_handle_;
+  }
+
+  inline rocblas_handle GetRocblasHandle(OpKernelContext* ctx) const {
+    return GetRocblasHandle(static_cast<RocmStream*>(ctx->GetComputeStream()));
+  }
+
+  static inline rocblas_handle GetRocblasHandle(onnxruntime::RocmStream* stream) {
+    return stream->rocblas_handle_;
+  }
+
  tunable::RocmTuningContext* GetTuningContext() const {
    return static_cast<tunable::RocmTuningContext*>(provider_->GetTuningContext());
  }
@ -106,7 +120,7 @@ class RocmKernel : public OpKernel {
      }
    }

-    RocmAsyncBuffer(const RocmKernel* op_kernel, gsl::span<const T> vec) : RocmAsyncBuffer(op_kernel, vec.size()) {
+    RocmAsyncBuffer(const RocmKernel* op_kernel, gsl::span<T const> vec) : RocmAsyncBuffer(op_kernel, vec.size()) {
      memcpy(CpuPtr(), vec.data(), vec.size() * sizeof(T));
    }

@ -151,28 +165,12 @@ class RocmKernel : public OpKernel {
    const RocmKernel* op_kernel_;
  };

-  inline rocblas_handle RocblasHandle() const {
-    return provider_->PerThreadRocblasHandle();
+  inline rocblas_handle DefaultRocblasHandle() const {
+    return provider_->PerThreadDefaultRocblasHandle();
  }

-  inline miopenHandle_t MiopenHandle() const {
-    return provider_->PerThreadMiopenHandle();
-  }
-
-  static inline rocblas_handle GetRocblasHandle(onnxruntime::RocmStream* stream) {
-    return stream->rocblas_handle_;
-  }
-
-  inline rocblas_handle GetRocblasHandle(OpKernelContext* ctx) const {
-    return GetRocblasHandle(static_cast<RocmStream*>(ctx->GetComputeStream()));
-  }
-
-  static inline miopenHandle_t GetMiopenHandle(onnxruntime::RocmStream* stream) {
-    return stream->miopen_handle_;
-  }
-
-  inline miopenHandle_t GetMiopenHandle(OpKernelContext* ctx) const {
-    return GetMiopenHandle(static_cast<RocmStream*>(ctx->GetComputeStream()));
+  inline miopenHandle_t DefaultMiopenHandle() const {
+    return provider_->PerThreadDefaultMiopenHandle();
  }

 protected:
--- a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc
+++ b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc
@ -3,15 +3,13 @@

 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/rocm/rocm_provider_factory.h"
-
-#include <memory>
+#include "core/providers/rocm/rocm_provider_factory_creator.h"

 #include "core/common/gsl.h"

 #include "core/providers/rocm/rocm_execution_provider.h"
 #include "core/providers/rocm/rocm_execution_provider_info.h"
 #include "core/providers/rocm/rocm_allocator.h"
-#include "core/providers/rocm/rocm_provider_factory_creator.h"
 #include "core/providers/rocm/gpu_data_transfer.h"
 #include "core/providers/rocm/math/unary_elementwise_ops_impl.h"

@ -47,7 +45,7 @@ std::unique_ptr<IExecutionProvider> ROCMProviderFactory::CreateProvider() {
  return std::make_unique<ROCMExecutionProvider>(info_);
 }

-struct ProviderInfo_ROCM_Impl : ProviderInfo_ROCM {
+struct ProviderInfo_ROCM_Impl final : ProviderInfo_ROCM {
  OrtStatus* SetCurrentGpuDeviceId(_In_ int device_id) override {
    int num_devices;
    auto hip_err = ::hipGetDeviceCount(&num_devices);
@ -128,9 +126,24 @@ struct ProviderInfo_ROCM_Impl : ProviderInfo_ROCM {
  }

  // Used by slice_concatenate_test.cc and onnxruntime_pybind_state.cc
-  void rocmMemcpy_HostToDevice(void* dst, const void* src, size_t count) override { HIP_CALL_THROW(hipMemcpy(dst, src, count, hipMemcpyHostToDevice)); }
+
+  void rocmMemcpy_HostToDevice(void* dst, const void* src, size_t count) override {
+    // hipMemcpy() operates on the default stream
+    HIP_CALL_THROW(hipMemcpy(dst, src, count, hipMemcpyHostToDevice));
+
+    // To ensure that the copy has completed, invoke a stream sync for the default stream.
+    // For transfers from pageable host memory to device memory, a stream sync is performed before the copy is initiated.
+    // The function will return once the pageable buffer has been copied to the staging memory for DMA transfer
+    // to device memory, but the DMA to final destination may not have completed.
+
+    HIP_CALL_THROW(hipStreamSynchronize(0));
+  }
+
  // Used by onnxruntime_pybind_state.cc
-  void rocmMemcpy_DeviceToHost(void* dst, const void* src, size_t count) override { HIP_CALL_THROW(hipMemcpy(dst, src, count, hipMemcpyDeviceToHost)); }
+  void rocmMemcpy_DeviceToHost(void* dst, const void* src, size_t count) override {
+    // For transfers from device to either pageable or pinned host memory, the function returns only once the copy has completed.
+    HIP_CALL_THROW(hipMemcpy(dst, src, count, hipMemcpyDeviceToHost));
+  }

  int hipGetDeviceCount() override {
    int num_devices = 0;
@ -152,10 +165,9 @@ struct ProviderInfo_ROCM_Impl : ProviderInfo_ROCM {
    return std::make_shared<ROCMProviderFactory>(info);
  }

-  std::shared_ptr<IAllocator> CreateRocmAllocator(int16_t device_id, size_t gpu_mem_limit, onnxruntime::ArenaExtendStrategy arena_extend_strategy, onnxruntime::ROCMExecutionProviderExternalAllocatorInfo& external_allocator_info, OrtArenaCfg* default_memory_arena_cfg) override {
+  std::shared_ptr<IAllocator> CreateRocmAllocator(int16_t device_id, size_t gpu_mem_limit, onnxruntime::ArenaExtendStrategy arena_extend_strategy, onnxruntime::ROCMExecutionProviderExternalAllocatorInfo& external_allocator_info, const OrtArenaCfg* default_memory_arena_cfg) override {
    return ROCMExecutionProvider::CreateRocmAllocator(device_id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, default_memory_arena_cfg);
  }
-
 } g_info;

 struct ROCM_Provider : Provider {
@ -169,8 +181,8 @@ struct ROCM_Provider : Provider {
    info.gpu_mem_limit = params->gpu_mem_limit;
    info.arena_extend_strategy = static_cast<onnxruntime::ArenaExtendStrategy>(params->arena_extend_strategy);
    info.miopen_conv_exhaustive_search = params->miopen_conv_exhaustive_search;
-    info.do_copy_in_default_stream = params->do_copy_in_default_stream;
-    info.has_user_compute_stream = params->has_user_compute_stream;
+    info.do_copy_in_default_stream = params->do_copy_in_default_stream != 0;
+    info.has_user_compute_stream = params->has_user_compute_stream != 0;
    info.user_compute_stream = params->user_compute_stream;
    info.default_memory_arena_cfg = params->default_memory_arena_cfg;
    info.tunable_op.enable = params->tunable_op_enable;
@ -180,21 +192,32 @@ struct ROCM_Provider : Provider {
    return std::make_shared<ROCMProviderFactory>(info);
  }

+  /**
+   * This function will be called by the C API UpdateROCMProviderOptions().
+   *
+   * What this function does is equivalent to resetting the OrtROCMProviderOptions instance with
+   * default ROCMExecutionProviderInf instance first and then set up the provided provider options.
+   * See ROCMExecutionProviderInfo::FromProviderOptions() for more details.
+   */
  void UpdateProviderOptions(void* provider_options, const ProviderOptions& options) override {
-    auto info = onnxruntime::ROCMExecutionProviderInfo::FromProviderOptions(options);
+    auto internal_options = onnxruntime::ROCMExecutionProviderInfo::FromProviderOptions(options);
    auto& rocm_options = *reinterpret_cast<OrtROCMProviderOptions*>(provider_options);

-    rocm_options.device_id = info.device_id;
-    rocm_options.gpu_mem_limit = info.gpu_mem_limit;
-    rocm_options.arena_extend_strategy = static_cast<int>(info.arena_extend_strategy);
-    rocm_options.miopen_conv_exhaustive_search = info.miopen_conv_exhaustive_search;
-    rocm_options.do_copy_in_default_stream = info.do_copy_in_default_stream;
-    rocm_options.has_user_compute_stream = info.has_user_compute_stream;
-    rocm_options.user_compute_stream = info.user_compute_stream;
-    rocm_options.default_memory_arena_cfg = info.default_memory_arena_cfg;
-    rocm_options.tunable_op_enable = info.tunable_op.enable;
-    rocm_options.tunable_op_tuning_enable = info.tunable_op.tuning_enable;
-    rocm_options.tunable_op_max_tuning_duration_ms = info.tunable_op.max_tuning_duration_ms;
+    rocm_options.device_id = internal_options.device_id;
+    rocm_options.gpu_mem_limit = internal_options.gpu_mem_limit;
+    rocm_options.arena_extend_strategy = static_cast<int>(internal_options.arena_extend_strategy);
+    rocm_options.miopen_conv_exhaustive_search = internal_options.miopen_conv_exhaustive_search;
+    rocm_options.do_copy_in_default_stream = internal_options.do_copy_in_default_stream;
+    rocm_options.has_user_compute_stream = internal_options.has_user_compute_stream;
+    // The 'has_user_compute_stream' of the OrtROCMProviderOptions instance can be set by C API UpdateROCMProviderOptionsWithValue() as well.
+    // We only set the 'has_user_compute_stream' of the OrtROCMProviderOptions instance if it is provided in options
+    if (options.find("has_user_compute_stream") != options.end()) {
+      rocm_options.user_compute_stream = internal_options.user_compute_stream;
+    }
+    rocm_options.default_memory_arena_cfg = internal_options.default_memory_arena_cfg;
+    rocm_options.tunable_op_enable = internal_options.tunable_op.enable;
+    rocm_options.tunable_op_tuning_enable = internal_options.tunable_op.tuning_enable;
+    rocm_options.tunable_op_max_tuning_duration_ms = internal_options.tunable_op.max_tuning_duration_ms;
  }

  ProviderOptions GetProviderOptions(const void* provider_options) override {
--- a/onnxruntime/core/providers/rocm/rocm_provider_factory.h
+++ b/onnxruntime/core/providers/rocm/rocm_provider_factory.h
@ -3,6 +3,7 @@

 #include "onnxruntime_c_api.h"
 #include "core/framework/provider_options.h"
+#include "core/common/common.h"

 namespace onnxruntime {
 class IAllocator;
@ -43,7 +44,16 @@ struct ProviderInfo_ROCM {
 #endif

  virtual std::shared_ptr<onnxruntime::IExecutionProviderFactory> CreateExecutionProviderFactory(const onnxruntime::ROCMExecutionProviderInfo& info) = 0;
-  virtual std::shared_ptr<onnxruntime::IAllocator> CreateRocmAllocator(int16_t device_id, size_t gpu_mem_limit, onnxruntime::ArenaExtendStrategy arena_extend_strategy, onnxruntime::ROCMExecutionProviderExternalAllocatorInfo& external_allocator_info, OrtArenaCfg* default_memory_arena_cfg) = 0;
+  virtual std::shared_ptr<onnxruntime::IAllocator> CreateRocmAllocator(int16_t device_id, size_t gpu_mem_limit, onnxruntime::ArenaExtendStrategy arena_extend_strategy, onnxruntime::ROCMExecutionProviderExternalAllocatorInfo& external_allocator_info, const OrtArenaCfg* default_memory_arena_cfg) = 0;
+
+  // This function is the entry point to ROCM EP's UT cases.
+  // All tests ared only called from onnxruntime_test_all.
+  virtual void TestAll() {
+    ORT_NOT_IMPLEMENTED(__FUNCTION__, " is only implements in test code path.");
+  }
+
+ protected:
+  ~ProviderInfo_ROCM() = default;  // Can only be destroyed through a subclass instance
 };

 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/rocm/rocm_stream_handle.cc
+++ b/onnxruntime/core/providers/rocm/rocm_stream_handle.cc
@ -1,7 +1,9 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#include "core/providers/rocm/rocm_resource.h"
 #include "core/providers/rocm/rocm_stream_handle.h"
 #include "core/providers/rocm/rocm_common.h"
 // #include "core/common/spin_pause.h"
-#include "core/providers/rocm/rocm_resource.h"

 namespace onnxruntime {

@ -82,15 +84,29 @@ void RocmStream::EnqueDeferredCPUBuffer(void* cpu_buffer) {
  deferred_cpu_buffers_.push_back(cpu_buffer);
 }

-struct CpuBuffersInfo {  // TODO: should be moved to base class
+struct CpuBuffersInfo {
+  // This struct stores the information needed
+  // to release CPU buffers allocated for GPU kernels.
+  // It's used to enqueue their release after
+  // associated GPU kernels in a ROCM stream.
+
+  // This is a CPU allocator in ROCM EP.
+  // It must be the one used to allocate the
+  // following pointers.
  AllocatorPtr allocator;
+  // buffers[i] is the i-th pointer added by
+  // AddDeferredReleaseCPUPtr for a specific
+  // ROCM stream. For example, this fields
+  // should contain all values in
+  // deferred_release_buffer_pool_[my_stream]
+  // when release my_stream's buffers.
  std::unique_ptr<void*[]> buffers;
  // CPU buffer buffers[i].
  // Number of buffer points in "buffers".
  size_t n_buffers;
 };

-static void ReleaseCpuBufferCallback(hipStream_t /*stream*/, hipError_t /*status*/, void* raw_info) {  // TODO: should be moved to base class
+static void ReleaseCpuBufferCallback(void* raw_info) {
  std::unique_ptr<CpuBuffersInfo> info = std::make_unique<CpuBuffersInfo>();
  info.reset(reinterpret_cast<CpuBuffersInfo*>(raw_info));
  for (size_t i = 0; i < info->n_buffers; ++i) {
@ -111,14 +127,7 @@ Status RocmStream::CleanUpOnRunEnd() {
      cpu_buffers_info->buffers[i] = deferred_cpu_buffers_.at(i);
    }
    cpu_buffers_info->n_buffers = deferred_cpu_buffers_.size();
-    // TODO(wechi): CUDA deprecates cudaStreamAddCallback and
-    // uses another API, cudaLaunchHostFunc(which can be
-    // captured in CUDA graph). Once AMD adds similar feature,
-    // we should replace the following line with
-    //  hipLaunchHostFunc(stream, ReleaseCpuBufferCallback, cpu_buffers_info);
-
-    // Release memory asynchronously to avoid blocking the compute stream.
-    HIP_RETURN_IF_ERROR(hipStreamAddCallback(static_cast<hipStream_t>(GetHandle()), ReleaseCpuBufferCallback, cpu_buffers_info.release(), 0));
+    HIP_RETURN_IF_ERROR(hipLaunchHostFunc(static_cast<hipStream_t>(GetHandle()), ReleaseCpuBufferCallback, cpu_buffers_info.release()));
  } else {
    HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast<hipStream_t>(GetHandle())));
    for (auto* buffer : deferred_cpu_buffers_) {
@ -130,10 +139,10 @@ Status RocmStream::CleanUpOnRunEnd() {
  return Status::OK();
 }

-void* RocmStream::GetResource(int version, int type) const {
+void* RocmStream::GetResource(int version, int id) const {
  ORT_ENFORCE(version <= ORT_ROCM_RESOUCE_VERSION, "resource version unsupported!");
  void* resource{};
-  switch (type) {
+  switch (id) {
    case RocmResource::hip_stream_t:
      return reinterpret_cast<void*>(GetHandle());
      break;
@ -149,6 +158,7 @@ void* RocmStream::GetResource(int version, int type) const {
  return resource;
 }

+// CPU Stream command handles
 void WaitRocmNotificationOnDevice(Stream& stream, synchronize::Notification& notification) {
  static_cast<RocmNotification*>(&notification)->wait_on_device(stream);
 }
--- a/onnxruntime/core/providers/rocm/rocm_stream_handle.h
+++ b/onnxruntime/core/providers/rocm/rocm_stream_handle.h
@ -1,3 +1,6 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
 #pragma once
 #include "core/providers/rocm/rocm_pch.h"
 // #include "core/providers/cuda/shared_inc/cuda_utils.h"
@ -17,14 +20,12 @@ struct RocmStream : Stream {

  ~RocmStream();

-  std::unique_ptr<synchronize::Notification> CreateNotification(size_t num_consumers) override;
+  std::unique_ptr<synchronize::Notification> CreateNotification(size_t /*num_consumers*/) override;

  void Flush() override;

  Status CleanUpOnRunEnd() override;

-  void* GetResource(int version, int id) const override;
-
  void EnqueDeferredCPUBuffer(void* cpu_buffer);

  bool own_stream_{true};
@ -33,6 +34,8 @@ struct RocmStream : Stream {

  rocblas_handle rocblas_handle_{};

+  void* GetResource(int version, int id) const override;
+
 private:
  std::vector<void*> deferred_cpu_buffers_;
  AllocatorPtr cpu_allocator_;
--- a/onnxruntime/core/providers/rocm/rocm_utils.cu
+++ b/onnxruntime/core/providers/rocm/rocm_utils.cu
@ -30,13 +30,14 @@ template <typename T>
 void Fill(hipStream_t stream, T* output, T value, int64_t count) {
  int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
  HIP_LONG N = static_cast<HIP_LONG>(count);
-  _Fill<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream>>>(output, value, N);
+  _Fill<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+      <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(output, value, N);
 }
 template <typename T>
 class ConstantBufferImpl : public IConstantBuffer<T> {
 public:
-  ConstantBufferImpl(T val) : buffer_(nullptr), count_(0), val_(val) {}
-
+  ConstantBufferImpl(T val) : buffer_(nullptr), count_(0), val_(val) {
+  }
  ~ConstantBufferImpl() {
    if (buffer_)
      HIP_CALL_THROW(hipFree(buffer_));
@ -70,6 +71,7 @@ std::unique_ptr<IConstantBuffer<T>> CreateConstantOnes() {
 template std::unique_ptr<IConstantBuffer<float>> CreateConstantOnes<float>();
 template std::unique_ptr<IConstantBuffer<double>> CreateConstantOnes<double>();
 template std::unique_ptr<IConstantBuffer<half>> CreateConstantOnes<half>();
+template std::unique_ptr<IConstantBuffer<BFloat16>> CreateConstantOnes<BFloat16>();

 #define SPECIALIZED_FILL(T) \
  template void Fill<T>(hipStream_t stream, T * output, T value, int64_t count);
@ -81,6 +83,7 @@ SPECIALIZED_FILL(int64_t)
 SPECIALIZED_FILL(float)
 SPECIALIZED_FILL(double)
 SPECIALIZED_FILL(__half)
+SPECIALIZED_FILL(BFloat16)

 }  // namespace rocm
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/rocm/shared_inc/fast_divmod.h
+++ b/onnxruntime/core/providers/rocm/shared_inc/fast_divmod.h
@ -1,90 +0,0 @@
-//
-// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved
-// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
-//
-
-#pragma once
-
-#include <iostream>
-#include <limits>
-#include <hip/hip_runtime.h>
-#include <cmath>
-#include "core/common/common.h"
-
-namespace onnxruntime {
-namespace rocm {
-
-// DivMod is a helper class for integer division and modulo operation.
-// There is a fast version for int type and a slow version for other type.
-template <typename T>
-struct DivMod {
-  DivMod(T d = 1) {
-    d_ = d == 0 ? 1 : d;
-    ORT_ENFORCE(d_ >= 1 && d_ <= std::numeric_limits<T>::max());
-  }
-
-  __host__ __device__ inline T div(T n) const {
-    return n / d_;
-  }
-
-  __host__ __device__ inline T mod(T n) const {
-    return n % d_;
-  }
-
-  __host__ __device__ inline void divmod(T n, T& q, T& r) const {
-    q = div(n);
-    r = n - q * d_;
-  }
-
-  T d_;  // divisor
-};
-
-// The code below is based on section 4 Unsigned division of paper https://gmplib.org/~tege/divcnst-pldi94.pdf
-// In current ORT, fast_divmod is used for calculating the position of a element in tensor,
-// so unsigned integer division from the paper is good enough for ORT. The advantage is that div is very simple,
-// then GPU compiler can do loop unroll easilly when divmod is called in a loop.
-template <>
-struct DivMod<int> {
-  DivMod(int d = 1) {
-    d_ = d == 0 ? 1 : d;
-    ORT_ENFORCE(d_ >= 1 && d_ <= static_cast<uint32_t>(std::numeric_limits<int>::max()));
-
-    for (l_ = 0; l_ < 32; l_++)
-      if ((1U << l_) >= d_) break;
-
-    uint64_t one = 1;
-    uint64_t m = ((one << 32) * ((one << l_) - d_)) / d_ + 1;
-    M_ = static_cast<uint32_t>(m);
-    // according to paper, the value of m' should fit in a unsigned integer.
-    ORT_ENFORCE(M_ > 0 && M_ == m);
-  }
-
-  __host__ __device__ inline int div(int n) const {
-#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
-    uint32_t t = __umulhi(M_, n);
-    return (t + n) >> l_;
-#else
-    // Using uint64_t for t, then t + n won't overflow.
-    uint64_t t = ((uint64_t)M_ * n) >> 32;
-    return static_cast<int>((t + n) >> l_);
-#endif
-  }
-
-  __host__ __device__ inline int mod(int n) const {
-    return n - div(n) * d_;
-  }
-
-  __host__ __device__ inline void divmod(int n, int& q, int& r) const {
-    q = div(n);
-    r = n - q * d_;
-  }
-
-  uint32_t d_;  // divisor
-  uint32_t M_;  // m' in the paper.
-  uint32_t l_;  // l_ = ceil(log2(d_))
-};
-
-using fast_divmod = DivMod<int>;  // Keep the old name for backward compatibility.
-
-}  // namespace rocm
-}  // namespace onnxruntime
--- a/onnxruntime/core/providers/rocm/shared_inc/rocm_call.h
+++ b/onnxruntime/core/providers/rocm/shared_inc/rocm_call.h
@ -17,16 +17,20 @@ std::conditional_t<THRW, void, Status> RocmCall(

 #define HIP_CALL(expr) (RocmCall<hipError_t, false>((expr), #expr, "HIP", hipSuccess, "", __FILE__, __LINE__))
 #define ROCBLAS_CALL(expr) (RocmCall<rocblas_status, false>((expr), #expr, "ROCBLAS", rocblas_status_success, "", __FILE__, __LINE__))
+
 #define HIPSPARSE_CALL(expr) (RocmCall<hipsparseStatus_t, false>((expr), #expr, "HIPSPARSE", HIPSPARSE_STATUS_SUCCESS, "", __FILE__, __LINE__))
 #define HIPRAND_CALL(expr) (RocmCall<hiprandStatus_t, false>((expr), #expr, "HIPRAND", HIPRAND_STATUS_SUCCESS, "", __FILE__, __LINE__))
 #define MIOPEN_CALL(expr) (RocmCall<miopenStatus_t, false>((expr), #expr, "MIOPEN", miopenStatusSuccess, "", __FILE__, __LINE__))
 #define MIOPEN_CALL2(expr, m) (RocmCall<miopenStatus_t, false>((expr), #expr, "MIOPEN", miopenStatusSuccess, m, __FILE__, __LINE__))
+
 #define HIPFFT_CALL(expr) (RocmCall<hipfftResult, false>((expr), #expr, "HIPFFT", HIPFFT_SUCCESS, "", __FILE__, __LINE__))

 #define HIP_CALL_THROW(expr) (RocmCall<hipError_t, true>((expr), #expr, "HIP", hipSuccess, "", __FILE__, __LINE__))
 #define ROCBLAS_CALL_THROW(expr) (RocmCall<rocblas_status, true>((expr), #expr, "ROCBLAS", rocblas_status_success, "", __FILE__, __LINE__))
+
 #define HIPSPARSE_CALL_THROW(expr) (RocmCall<hipsparseStatus_t, true>((expr), #expr, "HIPSPARSE", HIPSPARSE_STATUS_SUCCESS, "", __FILE__, __LINE__))
 #define HIPRAND_CALL_THROW(expr) (RocmCall<hiprandStatus_t, true>((expr), #expr, "HIPRAND", HIPRAND_STATUS_SUCCESS, "", __FILE__, __LINE__))
+
 #define MIOPEN_CALL_THROW(expr) (RocmCall<miopenStatus_t, true>((expr), #expr, "MIOPEN", miopenStatusSuccess, "", __FILE__, __LINE__))
 #define MIOPEN_CALL_THROW2(expr, m) (RocmCall<miopenStatus_t, true>((expr), #expr, "MIOPEN", miopenStatusSuccess, m, __FILE__, __LINE__))
 #define HIPFFT_CALL_THROW(expr) (RocmCall<hipfftResult, true>((expr), #expr, "HIPFFT", HIPFFT_SUCCESS, "", __FILE__, __LINE__))
--- a/onnxruntime/test/contrib_ops/element_wise_ops_test.cc
+++ b/onnxruntime/test/contrib_ops/element_wise_ops_test.cc
@ -162,13 +162,13 @@ static void RunBiasGeluTestBFloat16(const std::vector<int64_t>& input_dims, cons
  tester.AddInput<BFloat16>("B", bias_dims, bias_data_bf16);
  tester.AddOutput<BFloat16>("C", input_dims, output_data_bf16);
  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
-#ifdef USE_CUDA
+#if defined(USE_CUDA)
  execution_providers.push_back(DefaultCudaExecutionProvider());
-#elif USE_ROCM
+#elif defined(USE_ROCM)
  execution_providers.push_back(DefaultRocmExecutionProvider());
-#elif USE_DNNL
+#elif defined(USE_DNNL)
  execution_providers.push_back(DefaultDnnlExecutionProvider());
-#elif USE_DML
+#elif defined(USE_DML)
  execution_providers.push_back(DefaultDmlExecutionProvider());
 #endif
  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
@ -197,9 +197,8 @@ TEST(BiasGeluTest, BFloat16) {
 }
 #endif

+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(MathOpTest, ComplexMul) {
-  if (DefaultCudaExecutionProvider() == nullptr) return;
-
  std::vector<float> input_a_data = {
      -0.5f, 0.6f};

@ -219,13 +218,15 @@ TEST(MathOpTest, ComplexMul) {
  tester.AddOutput<float>("C", {4, 2}, output_data);

  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#if defined(USE_CUDA)
  execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif defined(USE_ROCM)
+  execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif
  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }

 TEST(MathOpTest, ComplexMulConj) {
-  if (DefaultCudaExecutionProvider() == nullptr) return;
-
  std::vector<float> input_a_data = {
      -0.5f, 0.6f};

@ -245,13 +246,15 @@ TEST(MathOpTest, ComplexMulConj) {
  tester.AddOutput<float>("C", {4, 2}, output_data);

  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#ifdef USE_CUDA
  execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif defined(USE_ROCM)
+  execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif
  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }

 TEST(MathOpTest, ComplexMul_fp16) {
-  if (DefaultCudaExecutionProvider() == nullptr) return;
-
  std::vector<MLFloat16> input_a_data = {
      MLFloat16(-0.5f), MLFloat16(0.6f)};

@ -271,13 +274,15 @@ TEST(MathOpTest, ComplexMul_fp16) {
  tester.AddOutput<MLFloat16>("C", {4, 2}, output_data);

  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#ifdef USE_CUDA
  execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif defined(USE_ROCM)
+  execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif
  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }

 TEST(MathOpTest, ComplexMulConj_fp16) {
-  if (DefaultCudaExecutionProvider() == nullptr) return;
-
  std::vector<MLFloat16> input_a_data = {
      MLFloat16(-0.5f), MLFloat16(0.6f)};

@ -297,9 +302,14 @@ TEST(MathOpTest, ComplexMulConj_fp16) {
  tester.AddOutput<MLFloat16>("C", {4, 2}, output_data);

  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#ifdef USE_CUDA
  execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif defined(USE_ROCM)
+  execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif
  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
+#endif

 }  // namespace test
 }  // namespace onnxruntime
--- a/onnxruntime/test/contrib_ops/fft_op_test.cc
+++ b/onnxruntime/test/contrib_ops/fft_op_test.cc
@ -8,7 +8,15 @@
 namespace onnxruntime {
 namespace test {
 TEST(ContribOpTest, Rfft) {
-  if (DefaultCudaExecutionProvider() == nullptr) return;
+  if (DefaultCudaExecutionProvider() == nullptr && DefaultRocmExecutionProvider() == nullptr) return;
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  if (DefaultCudaExecutionProvider() != nullptr) {
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+  }
+  if (DefaultRocmExecutionProvider() != nullptr) {
+    execution_providers.push_back(DefaultRocmExecutionProvider());
+  }

  OpTester test("Rfft", 1, onnxruntime::kMSDomain);
  test.AddAttribute("signal_ndim", static_cast<int64_t>(1));
@ -17,13 +25,19 @@ TEST(ContribOpTest, Rfft) {
  // Target values conputed using PyTorch torch.fft.rfft(X, dim=-1, norm="backward")
  test.AddInput<float>("X", {4, 4}, {0.8129f, 1.3108f, -0.8790f, -1.2046f, 0.1661f, -0.9831f, 0.5879f, 0.4918f, 1.2506f, 0.7244f, -2.6260f, -1.1268f, -1.6885f, 1.0439f, -0.2595f, 1.8780f});
  test.AddOutput<float>("Y", {4, 3, 2}, {0.0400f, 0.0000f, 1.6919f, -2.5154f, -0.1722f, 0.0000f, 0.2627f, 0.0000f, -0.4218f, 1.4748f, 1.2454f, 0.0000f, -1.7779f, 0.0000f, 3.8766f, -1.8512f, -0.9730f, 0.0000f, 0.9740f, 0.0000f, -1.4290f, 0.8341f, -4.8699f, 0.0000f});
-  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
-  execution_providers.push_back(DefaultCudaExecutionProvider());
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }

 TEST(ContribOpTest, Irfft) {
-  if (DefaultCudaExecutionProvider() == nullptr) return;
+  if (DefaultCudaExecutionProvider() == nullptr && DefaultRocmExecutionProvider() == nullptr) return;
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  if (DefaultCudaExecutionProvider() != nullptr) {
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+  }
+  if (DefaultRocmExecutionProvider() != nullptr) {
+    execution_providers.push_back(DefaultRocmExecutionProvider());
+  }

  OpTester test("Irfft", 1, onnxruntime::kMSDomain);
  test.AddAttribute("signal_ndim", static_cast<int64_t>(1));
@ -31,8 +45,6 @@ TEST(ContribOpTest, Irfft) {
  test.AddAttribute("normalized", static_cast<int64_t>(0));
  test.AddInput<float>("X", {4, 3, 2}, {0.0400f, 0.0000f, 1.6919f, -2.5154f, -0.1722f, 0.0000f, 0.2627f, 0.0000f, -0.4218f, 1.4748f, 1.2454f, 0.0000f, -1.7779f, 0.0000f, 3.8766f, -1.8512f, -0.9730f, 0.0000f, 0.9740f, 0.0000f, -1.4290f, 0.8341f, -4.8699f, 0.0000f});
  test.AddOutput<float>("Y", {4, 4}, {0.8129f, 1.3108f, -0.8790f, -1.2046f, 0.1661f, -0.9831f, 0.5879f, 0.4918f, 1.2506f, 0.7244f, -2.6260f, -1.1268f, -1.6885f, 1.0439f, -0.2595f, 1.8780f});
-  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
-  execution_providers.push_back(DefaultCudaExecutionProvider());
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 }  // namespace test
--- a/onnxruntime/test/contrib_ops/greedy_search_test.cc
+++ b/onnxruntime/test/contrib_ops/greedy_search_test.cc
@ -50,12 +50,26 @@ TEST(GreedySearchTest, GptGreedySearchFp16_VocabPadded) {
  const char* input_names[] = {"input_ids", "max_length", "min_length", "repetition_penalty"};
  const char* const output_names[] = {"sequences"};

-  constexpr int min_cuda_architecture = 530;
-  if (HasCudaEnvironment(min_cuda_architecture)) {
-    Ort::SessionOptions session_options;
 #ifdef USE_CUDA
-    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+  constexpr int min_cuda_architecture = 530;
+  bool is_cuda = HasCudaEnvironment(min_cuda_architecture);
+#else
+  bool is_cuda = false;
 #endif
+#ifdef USE_ROCM
+  bool is_rocm = true;
+#else
+  bool is_rocm = false;
+#endif
+
+  if (is_cuda || is_rocm) {
+    Ort::SessionOptions session_options;
+    if (is_cuda) {
+      Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+    }
+    if (is_rocm) {
+      Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, 0));
+    }

    // The following model was obtained by padding the vocabulary size in testdata/transformers/tiny_gpt2_beamsearch_fp16.onnx
    // (by making beam_size == 1) from 1000 to 1600 (just for illustrative and testing purposes) to see if the greedy search
@ -117,12 +131,26 @@ TEST(GreedySearchTest, GptGreedySearchFp32) {
  const char* input_names[] = {"input_ids", "max_length", "min_length", "repetition_penalty"};
  const char* const output_names[] = {"sequences"};

-  constexpr int min_cuda_architecture = 530;
-  if (HasCudaEnvironment(min_cuda_architecture)) {
-    Ort::SessionOptions session_options;
 #ifdef USE_CUDA
-    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+  constexpr int min_cuda_architecture = 530;
+  bool is_cuda = HasCudaEnvironment(min_cuda_architecture);
+#else
+  bool is_cuda = false;
 #endif
+#ifdef USE_ROCM
+  bool is_rocm = true;
+#else
+  bool is_rocm = false;
+#endif
+
+  if (is_cuda || is_rocm) {
+    Ort::SessionOptions session_options;
+    if (is_cuda) {
+      Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+    }
+    if (is_rocm) {
+      Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, 0));
+    }

    Ort::Session session(*ort_env, ORT_TSTR("testdata/transformers/tiny_gpt2_greedysearch_with_init_decoder.onnx"), session_options);

--- a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc
@ -358,7 +358,11 @@ void RunTest(int64_t max_iterations,
    // we want the CUDA provider to be first, and the CPU provider second. all except the Loop node should run on
    // CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Loop node correctly.
    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#if defined(USE_CUDA)
    execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif defined(USE_ROCM)
+    execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif
    execution_providers.push_back(DefaultCpuExecutionProvider());

    test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}, nullptr, &execution_providers);
@ -1038,8 +1042,8 @@ TEST(Loop, IterationCountAsOutput) {
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }

-#ifdef USE_CUDA
-// test that when part of the subgraph run on CUDA it executes successfully
+#if defined(USE_CUDA) || defined(USE_ROCM)
+// test that when part of the subgraph run on CUDA/ROCm it executes successfully
 TEST(Loop, MixedExecutionProviders) {
  RunOptions options{};
  options.mixed_execution_providers = true;
--- a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
@ -411,7 +411,11 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_
    // we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on
    // CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly.
    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#if defined(USE_CUDA)
    execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif defined(USE_ROCM)
+    execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif
    execution_providers.push_back(DefaultCpuExecutionProvider());

    test.Run(expect_result, failure_message, options.excluded_provider_types, nullptr, &execution_providers);
@ -1162,7 +1166,11 @@ void UnknownDimInSubgraphOutput(bool is_v8, bool mixed_execution_providers = fal
    // we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on
    // CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly.
    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#if defined(USE_CUDA)
    execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif defined(USE_ROCM)
+    execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif
    execution_providers.push_back(DefaultCpuExecutionProvider());

    test.Run(OpTester::ExpectResult::kExpectSuccess, "", RunOptions().excluded_provider_types, nullptr,
@ -1174,7 +1182,7 @@ void UnknownDimInSubgraphOutput(bool is_v8, bool mixed_execution_providers = fal

 TEST_8_AND_9(UnknownDimInSubgraphOutput);

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(Scan, MixedExecutionProviders) {
  RunOptions options{};
  options.is_v8 = false;
--- a/tools/ci_build/amd_hipify.py
+++ b/tools/ci_build/amd_hipify.py
@ -150,6 +150,8 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path):

    # CUFFT -> HIPFFT
    s = s.replace("CUFFT", "HIPFFT")
+    s = s.replace("cufftXtMakePlanMany", "hipfftXtMakePlanMany")
+    s = s.replace("cufftXtExec", "hipfftXtExec")

    # Undo where above hipify steps went too far.
    s = s.replace("id, ROCM", "id, CUDA")  # cuda_execution_provider.cc
@ -169,6 +171,24 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path):
    s = s.replace("#include <hiprand_kernel.h>", "#include <hiprand/hiprand_kernel.h>")
    s = s.replace("#include <rocblas.h>", "#include <rocblas/rocblas.h>")
    s = s.replace("#include <hipblas.h>", "#include <hipblas/hipblas.h>")
+    s = s.replace("#include <hipfft.h>", "#include <hipfft/hipfft.h>")
+    s = s.replace('#include "hipfft.h"', "#include <hipfft/hipfft.h>")
+    s = s.replace('#include "hipfftXt.h"', "#include <hipfft/hipfftXt.h>")
+
+    # Fix onnxruntime/contrib_ops/rocm/transformers. They include cpu headers which use "cuda" in their names.
+    s = s.replace("rocm_device_prop_", "cuda_device_prop_")
+    s = s.replace("rocm_device_arch_", "cuda_device_arch_")
+
+    # We want hipfft, which needs hipDataType etc, but only do this for files that have "fft" in their names
+    # And we do this last, undoing or fixing hipify mistakes.
+    if "fft" in src_file_path:
+        s = s.replace("rocblas_datatype", "hipDataType")
+        s = s.replace("hipDataType_f32_c", "HIP_C_32F")
+        s = s.replace("hipDataType_f32_r", "HIP_R_32F")
+        s = s.replace("hipDataType_f64_c", "HIP_C_64F")
+        s = s.replace("hipDataType_f64_r", "HIP_R_64F")
+        s = s.replace("hipDataType_f16_c", "HIP_C_16F")
+        s = s.replace("hipDataType_f16_r", "HIP_R_16F")

    with open(dst_file_path, "w") as f:
        f.write(s)