[ROCm] Enable Sampling Op UT on AMD (#14581)

Making basic porting effort to run Sampling UT on ROCm ep, based on the commits: https://github.com/microsoft/onnxruntime/pull/13426 https://github.com/microsoft/onnxruntime/pull/14218 1. enabling EmbedLayerNorm op 2. enabling Sampling op 3. enabling helpers to copy data from CPU->GPU for subgraph This task is the first checkpoint. There could be other missing ops when testing a real model. We will migrate more code onto ROCm as needed. Co-authored-by: Ubuntu <ettao@ettao-amd-dev1.zvflicr54joexhdgnhvmxrxygg.phxx.internal.cloudapp.net>
2026-06-20 02:07:56 +00:00 · 2023-02-06 20:52:06 -08:00 · 2023-02-06 20:52:06 -08:00 · d632f9a3fa
commit d632f9a3fa
parent a5dab850b8
12 changed files with 142 additions and 118 deletions
--- a/cmake/onnxruntime_rocm_hipify.cmake
+++ b/cmake/onnxruntime_rocm_hipify.cmake
@ -11,10 +11,6 @@ set(contrib_ops_excluded_files
  "bert/attention_softmax.h"
  "bert/multihead_attention.cc"
  "bert/multihead_attention.h"
-  "bert/embed_layer_norm.cc"
-  "bert/embed_layer_norm.h"
-  "bert/embed_layer_norm_impl.cu"
-  "bert/embed_layer_norm_impl.h"
  "bert/fast_gelu_impl.cu"
  "bert/fast_gelu_impl.h"
  "bert/fast_gelu.cc"
@ -85,17 +81,8 @@ set(contrib_ops_excluded_files
  "tensor/image_scaler_impl.h"
  "transformers/beam_search.cc"
  "transformers/beam_search.h"
-  "transformers/generation_device_helper.cc"
-  "transformers/generation_device_helper.h"
-  "transformers/generation_cuda_impl.cu"
-  "transformers/generation_cuda_impl.h"
  "transformers/greedy_search.cc"
  "transformers/greedy_search.h"
-  "transformers/sampling.cc"
-  "transformers/sampling.h"
-  "transformers/sampling_cuda_helper.h"
-  "transformers/dump_cuda_tensor.cc"
-  "transformers/dump_cuda_tensor.h"
  "conv_transpose_with_dynamic_pads.cc"
  "conv_transpose_with_dynamic_pads.h"
  "cuda_contrib_kernels.cc"
--- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc
@ -116,7 +116,9 @@ const IExecutionProvider* Subgraph::GetProvider() const {
  const ExecutionProviders& providers = session_state_->GetExecutionProviders();
  const IExecutionProvider* cpu_provider = providers.Get(onnxruntime::kCpuExecutionProvider);
  const IExecutionProvider* cuda_provider = providers.Get(onnxruntime::kCudaExecutionProvider);
-  const IExecutionProvider* provider = cuda_provider ? cuda_provider : cpu_provider;
+  const IExecutionProvider* rocm_provider = providers.Get(onnxruntime::kRocmExecutionProvider);
+  const IExecutionProvider* gpu_provider = cuda_provider ? cuda_provider : rocm_provider;
+  const IExecutionProvider* provider = gpu_provider ? gpu_provider : cpu_provider;
  return provider;
 }

--- a/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc
+++ b/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc
@ -17,12 +17,12 @@ class PinnedHostBuffer {
 public:
  PinnedHostBuffer(size_t length)
      : buffer_(nullptr) {
-    cudaHostAlloc(&buffer_, length * sizeof(T), cudaHostAllocDefault);
+    CUDA_CALL_THROW(cudaHostAlloc((void**)&buffer_, length * sizeof(T), cudaHostAllocDefault));
  }

  virtual ~PinnedHostBuffer() {
    if (buffer_) {
-      cudaFreeHost(buffer_);
+      CUDA_CALL_THROW(cudaFreeHost(buffer_));
    }
  }

@ -46,8 +46,9 @@ void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, bool i
  // In that case, we copy tensor data as well. It is not needed, but it keeps code simple.
  int num_items = dim0 * dim1;
  auto data = std::make_shared<PinnedHostBuffer<T>>(num_items);
-  cudaDeviceSynchronize();
-  cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost);
+  CUDA_CALL_THROW(cudaDeviceSynchronize());
+  CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost));
+

  if (nullptr != name) {
    std::cout << std::string(name) << std::endl;
@ -64,8 +65,8 @@ template <typename T>
 void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, int dim2, bool is_gpu_tensor) {
  int num_items = dim0 * dim1 * dim2;
  auto data = std::make_shared<PinnedHostBuffer<T>>(num_items);
-  cudaDeviceSynchronize();
-  cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost);
+  CUDA_CALL_THROW(cudaDeviceSynchronize());
+  CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost));

  if (nullptr != name) {
    std::cout << std::string(name) << std::endl;
@ -82,8 +83,8 @@ template <typename T>
 void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, int dim2, int dim3, bool is_gpu_tensor) {
  int num_items = dim0 * dim1 * dim2 * dim3;
  auto data = std::make_shared<PinnedHostBuffer<T>>(num_items);
-  cudaDeviceSynchronize();
-  cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost);
+  CUDA_CALL_THROW(cudaDeviceSynchronize());
+  CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost));

  if (nullptr != name) {
    std::cout << std::string(name) << std::endl;
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu
@ -320,33 +320,33 @@ void GetTempStorageSize(const T* d_keys_in,
                        bool is_descending,
                        size_t& temp_storage_bytes) {
  if (is_descending) {
-    cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr,
-                                                       temp_storage_bytes,
-                                                       d_keys_in,
-                                                       (T*)nullptr,
-                                                       d_values_in,
-                                                       (int*)nullptr,
-                                                       num_items,
-                                                       num_segments,
-                                                       d_offsets,
-                                                       d_offsets + 1,
-                                                       0,
-                                                       sizeof(T) * 8,
-                                                       stream);
+    CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr,
+                                                                       temp_storage_bytes,
+                                                                       d_keys_in,
+                                                                       (T*)nullptr,
+                                                                       d_values_in,
+                                                                       (int*)nullptr,
+                                                                       num_items,
+                                                                       num_segments,
+                                                                       d_offsets,
+                                                                       d_offsets + 1,
+                                                                       0,
+                                                                       sizeof(T) * 8,
+                                                                       stream));
  } else {
-    cub::DeviceSegmentedRadixSort::SortPairs(nullptr,
-                                             temp_storage_bytes,
-                                             d_keys_in,
-                                             (T*)nullptr,
-                                             d_values_in,
-                                             (int*)nullptr,
-                                             num_items,
-                                             num_segments,
-                                             d_offsets,
-                                             d_offsets + 1,
-                                             0,
-                                             sizeof(T) * 8,
-                                             stream);
+    CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairs(nullptr,
+                                                             temp_storage_bytes,
+                                                             d_keys_in,
+                                                             (T*)nullptr,
+                                                             d_values_in,
+                                                             (int*)nullptr,
+                                                             num_items,
+                                                             num_segments,
+                                                             d_offsets,
+                                                             d_offsets + 1,
+                                                             0,
+                                                             sizeof(T) * 8,
+                                                             stream));
  }
 }

@ -412,33 +412,33 @@ void LaunchSortPairs(void* d_temp_storage,
                     cudaStream_t stream,
                     bool is_descending) {
  if (is_descending) {
-    cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
-                                                       temp_storage_bytes,
-                                                       d_keys_in,
-                                                       d_keys_out,
-                                                       d_values_in,
-                                                       d_values_out,
-                                                       num_items,
-                                                       num_segments,
-                                                       d_offsets,
-                                                       d_offsets + 1,
-                                                       0,
-                                                       sizeof(T) * 8,
-                                                       stream);
+    CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
+                                                                       temp_storage_bytes,
+                                                                       d_keys_in,
+                                                                       d_keys_out,
+                                                                       d_values_in,
+                                                                       d_values_out,
+                                                                       num_items,
+                                                                       num_segments,
+                                                                       d_offsets,
+                                                                       d_offsets + 1,
+                                                                       0,
+                                                                       sizeof(T) * 8,
+                                                                       stream));                                                      
  } else {
-    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
-                                             temp_storage_bytes,
-                                             d_keys_in,
-                                             d_keys_out,
-                                             d_values_in,
-                                             d_values_out,
-                                             num_items,
-                                             num_segments,
-                                             d_offsets,
-                                             d_offsets + 1,
-                                             0,
-                                             sizeof(T) * 8,
-                                             stream);
+    CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
+                                                             temp_storage_bytes,
+                                                             d_keys_in,
+                                                             d_keys_out,
+                                                             d_values_in,
+                                                             d_values_out,
+                                                             num_items,
+                                                             num_segments,
+                                                             d_offsets,
+                                                             d_offsets + 1,
+                                                             0,
+                                                             sizeof(T) * 8,
+                                                             stream));
  }
 }

@ -721,9 +721,9 @@ void TorchMultinomialKernelLauncher(float* d_input,
                                    cudaStream_t stream) {
  // Store the props in class variables
  int device;
-  cudaGetDevice(&device);
+  CUDA_CALL_THROW(cudaGetDevice(&device));
  cudaDeviceProp props;
-  cudaGetDeviceProperties(&props, device);
+  CUDA_CALL_THROW(cudaGetDeviceProperties(&props, device));

  int numSM = props.multiProcessorCount;
  int maxThreads = props.maxThreadsPerBlock;
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
@ -17,14 +17,23 @@
 #include "contrib_ops/cpu/transformers/subgraph_gpt.h"
 #include "contrib_ops/cuda/transformers/beam_search_topk.h"
 #include "contrib_ops/cuda/transformers/greedy_search_top_one.h"
+
+// the includes would be dummy for ROCm, we will ignore them for now
+#ifdef ENABLE_NVTX_PROFILE
 #include "core/providers/cuda/nvtx_profile.h"
 #include "core/providers/cuda/nvtx_profile_context.h"
+#endif
+
 #include "sampling_cuda_helper.h"

 #ifdef DEBUG_GENERATION
 #include <iostream>
 #endif

+using onnxruntime::cuda::ToCudaType;
+using onnxruntime::cuda::TArray;
+using onnxruntime::cuda::TopKImpl;
+
 namespace onnxruntime {
 namespace concurrency {
 class ThreadPool;
@ -203,12 +212,13 @@ void InitBeamState(transformers::IBeamSearchState<T>* beam_state,

  // TODO(tianleiwu): we can use another stream to avoid blocking subgraph execution.
  cudaStream_t cuda_stream = ort_stream ? static_cast<cudaStream_t>(ort_stream->GetHandle()) : nullptr;
-  cudaMemsetAsync(beam_state->next_token_logits.data(), 0, beam_state->next_token_logits.size_bytes(), cuda_stream);
-  cudaMemsetAsync(beam_state->next_token_scores.data(), 0, beam_state->next_token_scores.size_bytes(), cuda_stream);
-  cudaMemsetAsync(beam_state->next_tokens.data(), 0, beam_state->next_tokens.size_bytes(), cuda_stream);
-  cudaMemsetAsync(beam_state->next_indices.data(), 0, beam_state->next_indices.size_bytes(), cuda_stream);
-  cudaMemsetAsync(beam_state->next_scores.data(), 0, beam_state->next_scores.size_bytes(), cuda_stream);
-  cudaMemsetAsync(beam_state->topk_buffer.data(), 0, beam_state->topk_buffer.size_bytes(), cuda_stream);
+  CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_token_logits.data(), 0, beam_state->next_token_logits.size_bytes(), cuda_stream));
+  CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_token_scores.data(), 0, beam_state->next_token_scores.size_bytes(), cuda_stream));
+  CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_tokens.data(), 0, beam_state->next_tokens.size_bytes(), cuda_stream));
+  CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_indices.data(), 0, beam_state->next_indices.size_bytes(), cuda_stream));
+  CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_scores.data(), 0, beam_state->next_scores.size_bytes(), cuda_stream));
+  CUDA_CALL_THROW(cudaMemsetAsync(beam_state->topk_buffer.data(), 0, beam_state->topk_buffer.size_bytes(), cuda_stream));
+

  // Initialize score of first beam of each group with 0 and the rest with -1e9.
  cuda::LaunchInitKernel(beam_state->beam_scores.data(), batch_size, num_beams, cuda_stream);
@ -216,8 +226,8 @@ void InitBeamState(transformers::IBeamSearchState<T>* beam_state,
  // copy sequence lengths to GPU
  // since next_positions is only needed to update feeds after subgraph execution, so it is fine to use Async here.
  if (!beam_state->next_positions.empty()) {  // next_positions is empty for T5
-    cudaMemcpyAsync(beam_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
-                    cudaMemcpyHostToDevice, cuda_stream);
+    CUDA_CALL_THROW(cudaMemcpyAsync(beam_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
+                    cudaMemcpyHostToDevice, cuda_stream));
  }

 #ifdef ENABLE_NVTX_PROFILE
@ -234,12 +244,12 @@ void InitGreedyState(transformers::IGreedySearchState<T>* greedy_state,
  initStateRange.Begin();
 #endif

-  cudaStream_t cuda_stream = ort_stream ? reinterpret_cast<cudaStream_t>(ort_stream->GetHandle()) : nullptr;
-  cudaMemsetAsync(greedy_state->next_token_scores.data(), 0, greedy_state->next_token_scores.size_bytes(), cuda_stream);
-  cudaMemsetAsync(greedy_state->next_positions.data(), 0, greedy_state->next_positions.size_bytes(), cuda_stream);
+  cudaStream_t cuda_stream = ort_stream ? reinterpret_cast<cudaStream_t>(ort_stream->GetHandle()) : nullptr;             
+  CUDA_CALL_THROW(cudaMemsetAsync(greedy_state->next_token_scores.data(), 0, greedy_state->next_token_scores.size_bytes(), cuda_stream));
+  CUDA_CALL_THROW(cudaMemsetAsync(greedy_state->next_positions.data(), 0, greedy_state->next_positions.size_bytes(), cuda_stream));

-  cudaMemcpyAsync(greedy_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
-                  cudaMemcpyHostToDevice, cuda_stream);
+  CUDA_CALL_THROW(cudaMemcpyAsync(greedy_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
+                  cudaMemcpyHostToDevice, cuda_stream));

 #ifdef ENABLE_NVTX_PROFILE
  initStateRange.End();
--- a/onnxruntime/contrib_ops/cuda/transformers/sampling_cuda_helper.h
+++ b/onnxruntime/contrib_ops/cuda/transformers/sampling_cuda_helper.h
@ -11,6 +11,9 @@
 #include <iostream>
 #endif

+using onnxruntime::cuda::ToCudaType;
+using onnxruntime::cuda::dispatch_blockwise_softmax_forward;
+
 namespace onnxruntime {
 namespace contrib {
 namespace SamplingCudaHelper {
--- a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc
@ -69,6 +69,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ParametricSoftplus);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ParametricSoftplus);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Sampling);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh);
@ -166,8 +167,8 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
                                                                  1, MLFloat16, DecoderAttention)>,
      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int32_t, DynamicSlice)>,
      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int64_t, DynamicSlice)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, EmbedLayerNormalization)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, EmbedLayerNormalization)>,
      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ImageScaler)>,
      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ImageScaler)>,
      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ImageScaler)>,
@ -178,6 +179,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ParametricSoftplus)>,
      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ParametricSoftplus)>,
      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Sampling)>,
      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh)>,
      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh)>,
      // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh)>,
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@ -1004,7 +1004,7 @@ Status SessionState::CreateSubgraphSessionState() {
  for (auto& node : graph_.Nodes()) {
    for (auto& entry : node.GetAttributeNameToMutableSubgraphMap()) {
      const auto& ep = node.GetExecutionProviderType();
-      if (!ep.empty() && ep != kCpuExecutionProvider && ep != kCudaExecutionProvider) {
+      if (!ep.empty() && ep != kCpuExecutionProvider && ep != kCudaExecutionProvider && ep != kRocmExecutionProvider) {
        // SessionState is only used when ORT is executing the subgraph. If a non-ORT EP has taken the control flow
        // node containing the subgraph it will create whatever state it needs internally.
        continue;
--- a/onnxruntime/core/providers/rocm/math/softmax_impl.cu
+++ b/onnxruntime/core/providers/rocm/math/softmax_impl.cu
@ -131,6 +131,9 @@ SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(half, half, float)
 SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(double, double, double)
 SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(BFloat16, BFloat16, float)

+#ifndef DISABLE_CONTRIB_OPS
+SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(half, float, float)  // used by BeamSearch op
+#endif

 }
 }
--- a/onnxruntime/test/contrib_ops/embed_layer_norm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/embed_layer_norm_op_test.cc
@ -17,10 +17,11 @@ static void RunTest(const embedlayernorm::OpData& data,
  int min_cuda_architecture = use_float16 ? 530 : 0;

  bool enable_cuda = HasCudaEnvironment(min_cuda_architecture);
+  bool enable_rocm = DefaultRocmExecutionProvider().get() != nullptr;
  bool enable_dml = DefaultDmlExecutionProvider().get() != nullptr;
  bool enable_cpu = !use_float16;

-  if (enable_cpu || enable_cuda || enable_dml) {
+  if (enable_cpu || enable_cuda || enable_dml || enable_rocm) {
    // Input and output shapes
    //   Input 0 - input_ids          : (batch_size, sequence_size)
    //   Input 1 - segment_ids        : (batch_size, sequence_size)
@ -143,6 +144,10 @@ static void RunTest(const embedlayernorm::OpData& data,
      std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
      execution_providers.push_back(DefaultCudaExecutionProvider());
      tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+    } else if (enable_rocm) {
+      std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+      execution_providers.push_back(DefaultRocmExecutionProvider());
+      tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
    } else if (enable_dml) {
      std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
      execution_providers.push_back(DefaultDmlExecutionProvider());
--- a/onnxruntime/test/contrib_ops/sampling_test.cc
+++ b/onnxruntime/test/contrib_ops/sampling_test.cc
@ -14,8 +14,8 @@ namespace onnxruntime {
 namespace test {

 #if defined(__linux__) && !defined(__ANDROID__)
-#ifdef USE_CUDA
-TEST(SamplingTest, Gpt2Sampling_CUDA) {
+#if defined(USE_CUDA) || defined(USE_ROCM)
+TEST(SamplingTest, Gpt2Sampling_GPU) {
  std::vector<int32_t> input_ids{
      0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620,
      41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572,
@ -25,7 +25,6 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) {
  std::vector<int32_t> min_length{1};
  std::vector<float> repetition_penalty{1.0f};

-
  std::vector<int32_t> expected_output{
      0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620, 125, 543, 668,
      41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572, 776, 213, 697,
@ -35,9 +34,7 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) {
  const int64_t sequence_length = 12;

  std::vector<int64_t> input_ids_shape{batch_size, sequence_length};
-
  std::vector<int64_t> parameter_shape{1};
-
  std::vector<int64_t> expected_output_shape{input_ids_shape[0], max_length[0]};

  Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
@ -62,28 +59,36 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) {
  const char* const output_names[] = {"sequences"};

  Ort::SessionOptions session_options;
+#ifdef USE_CUDA
  constexpr int min_cuda_architecture = 530;
-  if (HasCudaEnvironment(min_cuda_architecture)) {
-    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
-
-    Ort::Session session(*ort_env, ORT_TSTR("testdata/transformers/tiny_gpt2_sampling.onnx"), session_options);
-
-    auto ort_outputs = session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(),
-                                   output_names, 1);
-
-    ASSERT_EQ(ort_outputs.size(), 1U);
-    const auto& sequences = ort_outputs[0];
-    ASSERT_TRUE(sequences.IsTensor());
-
-    auto result_ts = sequences.GetTensorTypeAndShapeInfo();
-    ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, result_ts.GetElementType());
-
-    ASSERT_EQ(expected_output_shape, result_ts.GetShape());
-    const auto* result_vals = sequences.GetTensorData<int32_t>();
-    auto result_span = gsl::make_span(result_vals, expected_output.size());
-
-    ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.begin(), result_span.end()));
+  if (!HasCudaEnvironment(min_cuda_architecture)) {
+    LOGS_DEFAULT(WARNING) << "Hardware NOT support current architecture";
+    return;
  }
+  Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+#else // USE_ROCM
+  OrtROCMProviderOptions rocm_options;
+  // TODO - verify the default settings
+  session_options.AppendExecutionProvider_ROCM(rocm_options);
+#endif
+
+  Ort::Session session(*ort_env, ORT_TSTR("testdata/transformers/tiny_gpt2_sampling.onnx"), session_options);
+
+  auto ort_outputs = session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(),
+                                 output_names, 1);
+
+  ASSERT_EQ(ort_outputs.size(), 1U);
+  const auto& sequences = ort_outputs[0];
+  ASSERT_TRUE(sequences.IsTensor());
+
+  auto result_ts = sequences.GetTensorTypeAndShapeInfo();
+  ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, result_ts.GetElementType());
+
+  ASSERT_EQ(expected_output_shape, result_ts.GetShape());
+  const auto* result_vals = sequences.GetTensorData<int32_t>();
+  auto result_span = gsl::make_span(result_vals, expected_output.size());
+
+  ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.begin(), result_span.end()));
 }
 #endif

--- a/tools/ci_build/amd_hipify.py
+++ b/tools/ci_build/amd_hipify.py
@ -59,6 +59,8 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path):
    s = s.replace("GPU_WARP_SIZE = 32", "GPU_WARP_SIZE = 64")
    s = s.replace("std::exp", "expf")
    s = s.replace("std::log", "logf")
+    s = s.replace("WaitCudaNotificationOnDevice", "WaitRocmNotificationOnDevice")
+    s = s.replace("hipHostAlloc", "hipHostMalloc")
    s = s.replace(
        "#include <cub/device/device_radix_sort.cuh>",
        "#include <hipcub/hipcub.hpp>\n#include <hipcub/backend/rocprim/device/device_radix_sort.hpp>",
@ -67,6 +69,10 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path):
        '#include "cub/device/device_radix_sort.cuh"',
        "#include <hipcub/hipcub.hpp>\n#include <hipcub/backend/rocprim/device/device_radix_sort.hpp>",
    )
+    s = s.replace(
+        "#include <cub/device/device_segmented_radix_sort.cuh>",
+        "#include <hipcub/backend/rocprim/device/device_segmented_radix_sort.hpp>",
+    )
    s = s.replace(
        "#include <cub/device/device_reduce.cuh>", "#include <hipcub/backend/rocprim/device/device_reduce.hpp>"
    )