Remove all usages of hipLaunchKernelGGL (#14089)

All thoes macro syntaxes are mistake by https://github.com/ROCm-Developer-Tools/HIP-CPU/issues/8#issuecomment-756188453, they should be corrected in documentation but is not. We moved away hipThreadIdx_* in some previous commits, now we move away from hipLaunchKernelGGL.
2026-07-29 20:14:01 +00:00 · 2023-01-02 12:55:44 +08:00 · 2023-01-02 12:55:44 +08:00 · 613920d6c5
commit 613920d6c5
parent 6a9dc6c993
5 changed files with 84 additions and 84 deletions
--- a/onnxruntime/contrib_ops/rocm/bert/attention_softmax.h
+++ b/onnxruntime/contrib_ops/rocm/bert/attention_softmax.h
@ -292,32 +292,32 @@ Status ComputeSoftmax(
  const dim3 grid(sequence_length * num_heads, batch_size, 1);
  if (all_sequence_length <= 32) {
    const int blockSize = 32;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
+    SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
  } else if (all_sequence_length <= 64) {
    const int blockSize = 64;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
+    SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
  } else if (all_sequence_length <= 128) {
    const int blockSize = 128;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
+    SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
  } else if (all_sequence_length <= 256) {
    const int blockSize = 256;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
+    SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
  } else if (all_sequence_length <= 512) {
    const int blockSize = 512;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
+    SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
  } else if (all_sequence_length <= 1024) {
    const int blockSize = 1024;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
+    SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
  } else if (!is_unidirectional) {
    const int blockSize = 1024;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernel<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, add_before_softmax, input, output);
+    SoftmaxKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, add_before_softmax, input, output);
  } else {
    ORT_THROW("Attention ROCM operator does not support total sequence length > 1024.");
  }
@ -403,39 +403,39 @@ Status ComputeSoftmaxWithMask1D(

  if (all_sequence_length <= 32) {
    const int blockSize = 32;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, mask_index, mask_start,
-                       add_before_softmax, input, output, is_unidirectional);
+    MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, mask_index, mask_start,
+        add_before_softmax, input, output, is_unidirectional);
  } else if (all_sequence_length <= 64) {
    const int blockSize = 64;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, mask_index, mask_start,
-                       add_before_softmax, input, output, is_unidirectional);
+    MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, mask_index, mask_start,
+        add_before_softmax, input, output, is_unidirectional);
  } else if (all_sequence_length <= 128) {
    const int blockSize = 128;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, mask_index, mask_start,
-                       add_before_softmax, input, output, is_unidirectional);
+    MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, mask_index, mask_start,
+        add_before_softmax, input, output, is_unidirectional);
  } else if (all_sequence_length <= 256) {
    const int blockSize = 256;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, mask_index, mask_start,
-                       add_before_softmax, input, output, is_unidirectional);
+    MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, mask_index, mask_start,
+        add_before_softmax, input, output, is_unidirectional);
  } else if (all_sequence_length <= 512) {
    const int blockSize = 512;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, mask_index, mask_start,
-                       add_before_softmax, input, output, is_unidirectional);
+    MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, mask_index, mask_start,
+        add_before_softmax, input, output, is_unidirectional);
  } else if (all_sequence_length <= 1024) {
    const int blockSize = 1024;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, mask_index, mask_start,
-                       add_before_softmax, input, output, is_unidirectional);
+    MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, mask_index, mask_start,
+        add_before_softmax, input, output, is_unidirectional);
  } else if (!is_unidirectional) {
    const int blockSize = 1024;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernel<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length, mask_index, mask_start,
-                       add_before_softmax, input, output);
+    MaskedSoftmaxKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length, mask_index, mask_start,
+        add_before_softmax, input, output);
  } else {
    ORT_THROW("Attention ROCM operator does not support total sequence length > 1024.");
  }
@ -465,46 +465,46 @@ Status ComputeSoftmaxWithRawMask(hipStream_t stream,
  T* out = use_persistent_softmax ? persistent_softmax_workspace : output;
  if (all_sequence_length <= 32) {
    const int blockSize = 32;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length,
-                       attention_mask, key_padding_mask, add_before_softmax, input, out,
-                       is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
-                       use_persistent_softmax);
+    SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length,
+        attention_mask, key_padding_mask, add_before_softmax, input, out,
+        is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
+        use_persistent_softmax);
  } else if (all_sequence_length <= 64) {
    const int blockSize = 64;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length,
-                       attention_mask, key_padding_mask, add_before_softmax, input, out,
-                       is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
-                       use_persistent_softmax);
+    SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length,
+        attention_mask, key_padding_mask, add_before_softmax, input, out,
+        is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
+        use_persistent_softmax);
  } else if (all_sequence_length <= 128) {
    const int blockSize = 128;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length,
-                       attention_mask, key_padding_mask, add_before_softmax, input, out,
-                       is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
-                       use_persistent_softmax);
+    SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length,
+        attention_mask, key_padding_mask, add_before_softmax, input, out,
+        is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
+        use_persistent_softmax);
  } else if (all_sequence_length <= 256) {
    const int blockSize = 256;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length,
-                       attention_mask, key_padding_mask, add_before_softmax, input, out,
-                       is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
-                       use_persistent_softmax);
+    SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length,
+        attention_mask, key_padding_mask, add_before_softmax, input, out,
+        is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
+        use_persistent_softmax);
  } else if (all_sequence_length <= 512) {
    const int blockSize = 512;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length,
-                       attention_mask, key_padding_mask, add_before_softmax, input, out,
-                       is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
-                       use_persistent_softmax);
+    SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length,
+        attention_mask, key_padding_mask, add_before_softmax, input, out,
+        is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
+        use_persistent_softmax);
  } else if (all_sequence_length <= 1024) {
    const int blockSize = 1024;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
-                       all_sequence_length, sequence_length,
-                       attention_mask, key_padding_mask, add_before_softmax, input, out,
-                       is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
-                       use_persistent_softmax);
+    SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
+        all_sequence_length, sequence_length,
+        attention_mask, key_padding_mask, add_before_softmax, input, out,
+        is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
+        use_persistent_softmax);
  } else {
    ORT_THROW("Attention ROCM operator does not support total sequence length > 1024.");
  }
--- a/onnxruntime/core/providers/rocm/fpgeneric.cu
+++ b/onnxruntime/core/providers/rocm/fpgeneric.cu
@ -58,7 +58,7 @@ rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocbla
    dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1);
    dim3 dimBlock(TRANS_TILE_DIM, BLOCK_ROWS, 1);

-    hipLaunchKernelGGL(transposeNoOverlap, dim3(dimGrid), dim3(dimBlock), 0, stream, C, A, n, m);
+    transposeNoOverlap<<<dim3(dimGrid), dim3(dimBlock), 0, stream>>>(C, A, n, m);
  } else {
    return rocblas_status_not_implemented;
  }
@ -68,7 +68,7 @@ rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocbla
 rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, const half* x, int incx, half* y, int incy) {
  dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
  dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
-  hipLaunchKernelGGL(CopyVectorHalf, dim3(dimGrid), dim3(dimBlock), 0, stream, x, incx, y, incy, n);
+  CopyVectorHalf<<<dim3(dimGrid), dim3(dimBlock), 0, stream>>>(x, incx, y, incy, n);
  return rocblas_status_success;
 }

@ -76,6 +76,6 @@ rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, cons
                                onnxruntime::BFloat16* y, int incy) {
  dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
  dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
-  hipLaunchKernelGGL(CopyVectorBFloat16, dim3(dimGrid), dim3(dimBlock), 0, stream, x, incx, y, incy, n);
+  CopyVectorBFloat16<<<dim3(dimGrid), dim3(dimBlock), 0, stream>>>(x, incx, y, incy, n);
  return rocblas_status_success;
-}
+}
--- a/onnxruntime/core/providers/rocm/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu
+++ b/onnxruntime/core/providers/rocm/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu
@ -63,21 +63,21 @@ void DiagonalImpl(

    switch (element_size) {
      case sizeof(int32_t):
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(_DiagonalKernel<int32_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, 
+        _DiagonalKernel<int32_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
            reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data), input_rank, dim_1, dim_2,
            input_strides, reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data), output_strides,
            output_size);
        break;

      case sizeof(int64_t):
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(_DiagonalKernel<int64_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, 
+        _DiagonalKernel<int64_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
            reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data), input_rank, dim_1, dim_2,
            input_strides, reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data), output_strides,
            output_size);
        break;

      case sizeof(int16_t):
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(_DiagonalKernel<half>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, 
+        _DiagonalKernel<half><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
            reinterpret_cast<const half*>(input_data), input_rank, dim_1, dim_2,
            input_strides, reinterpret_cast<half*>(output_data), output_strides,
            output_size);
--- a/onnxruntime/core/providers/rocm/math/softmax_impl.cu
+++ b/onnxruntime/core/providers/rocm/math/softmax_impl.cu
@ -52,37 +52,37 @@ void dispatch_warpwise_softmax_forward(hipStream_t stream, output_t* dst, const
    // Launch code would be more elegant if C++ supported FOR CONSTEXPR
    switch (log2_elements) {
      case 0:  // 1
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 0, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
+        softmax_warp_forward<input_t, output_t, acc_t, 0, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
        break;
      case 1:  // 2
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 1, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
+        softmax_warp_forward<input_t, output_t, acc_t, 1, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
        break;
      case 2:  // 4
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 2, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
+        softmax_warp_forward<input_t, output_t, acc_t, 2, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
        break;
      case 3:  // 8
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 3, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
+        softmax_warp_forward<input_t, output_t, acc_t, 3, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
        break;
      case 4:  // 16
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 4, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
+        softmax_warp_forward<input_t, output_t, acc_t, 4, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
        break;
      case 5:  // 32
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 5, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
+        softmax_warp_forward<input_t, output_t, acc_t, 5, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
        break;
      case 6:  // 64
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 6, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
+        softmax_warp_forward<input_t, output_t, acc_t, 6, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
        break;
      case 7:  // 128
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 7, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
+        softmax_warp_forward<input_t, output_t, acc_t, 7, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
        break;
      case 8:  // 256
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 8, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
+        softmax_warp_forward<input_t, output_t, acc_t, 8, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
        break;
      case 9:  // 512
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 9, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
+        softmax_warp_forward<input_t, output_t, acc_t, 9, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
        break;
      case 10:  // 1024
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 10, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
+        softmax_warp_forward<input_t, output_t, acc_t, 10, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
        break;
      default:
        break;
--- a/onnxruntime/core/providers/rocm/rocm_utils.cu
+++ b/onnxruntime/core/providers/rocm/rocm_utils.cu
@ -30,7 +30,7 @@ template <typename T>
 void Fill(hipStream_t stream, T* output, T value, int64_t count) {
  int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
  HIP_LONG N = static_cast<HIP_LONG>(count);
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(_Fill<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream, output, value, N);
+  _Fill<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream>>>(output, value, N);
 }
 template <typename T>
 class ConstantBufferImpl : public IConstantBuffer<T> {