Remove all usages of hipLaunchKernelGGL (#14089)

All thoes macro syntaxes are mistake by
https://github.com/ROCm-Developer-Tools/HIP-CPU/issues/8#issuecomment-756188453,
they should be corrected in documentation but is not. We moved away
hipThreadIdx_* in some previous commits, now we move away from
hipLaunchKernelGGL.
This commit is contained in:
cloudhan 2023-01-02 12:55:44 +08:00 committed by GitHub
parent 6a9dc6c993
commit 613920d6c5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 84 additions and 84 deletions

View file

@ -292,32 +292,32 @@ Status ComputeSoftmax(
const dim3 grid(sequence_length * num_heads, batch_size, 1);
if (all_sequence_length <= 32) {
const int blockSize = 32;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
} else if (all_sequence_length <= 64) {
const int blockSize = 64;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
} else if (all_sequence_length <= 128) {
const int blockSize = 128;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
} else if (all_sequence_length <= 256) {
const int blockSize = 256;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
} else if (all_sequence_length <= 512) {
const int blockSize = 512;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
} else if (all_sequence_length <= 1024) {
const int blockSize = 1024;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
} else if (!is_unidirectional) {
const int blockSize = 1024;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernel<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, add_before_softmax, input, output);
SoftmaxKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, add_before_softmax, input, output);
} else {
ORT_THROW("Attention ROCM operator does not support total sequence length > 1024.");
}
@ -403,39 +403,39 @@ Status ComputeSoftmaxWithMask1D(
if (all_sequence_length <= 32) {
const int blockSize = 32;
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
} else if (all_sequence_length <= 64) {
const int blockSize = 64;
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
} else if (all_sequence_length <= 128) {
const int blockSize = 128;
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
} else if (all_sequence_length <= 256) {
const int blockSize = 256;
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
} else if (all_sequence_length <= 512) {
const int blockSize = 512;
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
} else if (all_sequence_length <= 1024) {
const int blockSize = 1024;
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output, is_unidirectional);
} else if (!is_unidirectional) {
const int blockSize = 1024;
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernel<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output);
MaskedSoftmaxKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length, mask_index, mask_start,
add_before_softmax, input, output);
} else {
ORT_THROW("Attention ROCM operator does not support total sequence length > 1024.");
}
@ -465,46 +465,46 @@ Status ComputeSoftmaxWithRawMask(hipStream_t stream,
T* out = use_persistent_softmax ? persistent_softmax_workspace : output;
if (all_sequence_length <= 32) {
const int blockSize = 32;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
} else if (all_sequence_length <= 64) {
const int blockSize = 64;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
} else if (all_sequence_length <= 128) {
const int blockSize = 128;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
} else if (all_sequence_length <= 256) {
const int blockSize = 256;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
} else if (all_sequence_length <= 512) {
const int blockSize = 512;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
} else if (all_sequence_length <= 1024) {
const int blockSize = 1024;
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
all_sequence_length, sequence_length,
attention_mask, key_padding_mask, add_before_softmax, input, out,
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
use_persistent_softmax);
} else {
ORT_THROW("Attention ROCM operator does not support total sequence length > 1024.");
}

View file

@ -58,7 +58,7 @@ rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocbla
dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1);
dim3 dimBlock(TRANS_TILE_DIM, BLOCK_ROWS, 1);
hipLaunchKernelGGL(transposeNoOverlap, dim3(dimGrid), dim3(dimBlock), 0, stream, C, A, n, m);
transposeNoOverlap<<<dim3(dimGrid), dim3(dimBlock), 0, stream>>>(C, A, n, m);
} else {
return rocblas_status_not_implemented;
}
@ -68,7 +68,7 @@ rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocbla
rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, const half* x, int incx, half* y, int incy) {
dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
hipLaunchKernelGGL(CopyVectorHalf, dim3(dimGrid), dim3(dimBlock), 0, stream, x, incx, y, incy, n);
CopyVectorHalf<<<dim3(dimGrid), dim3(dimBlock), 0, stream>>>(x, incx, y, incy, n);
return rocblas_status_success;
}
@ -76,6 +76,6 @@ rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, cons
onnxruntime::BFloat16* y, int incy) {
dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
hipLaunchKernelGGL(CopyVectorBFloat16, dim3(dimGrid), dim3(dimBlock), 0, stream, x, incx, y, incy, n);
CopyVectorBFloat16<<<dim3(dimGrid), dim3(dimBlock), 0, stream>>>(x, incx, y, incy, n);
return rocblas_status_success;
}
}

View file

@ -63,21 +63,21 @@ void DiagonalImpl(
switch (element_size) {
case sizeof(int32_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(_DiagonalKernel<int32_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
_DiagonalKernel<int32_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data), input_rank, dim_1, dim_2,
input_strides, reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data), output_strides,
output_size);
break;
case sizeof(int64_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(_DiagonalKernel<int64_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
_DiagonalKernel<int64_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data), input_rank, dim_1, dim_2,
input_strides, reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data), output_strides,
output_size);
break;
case sizeof(int16_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(_DiagonalKernel<half>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
_DiagonalKernel<half><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
reinterpret_cast<const half*>(input_data), input_rank, dim_1, dim_2,
input_strides, reinterpret_cast<half*>(output_data), output_strides,
output_size);

View file

@ -52,37 +52,37 @@ void dispatch_warpwise_softmax_forward(hipStream_t stream, output_t* dst, const
// Launch code would be more elegant if C++ supported FOR CONSTEXPR
switch (log2_elements) {
case 0: // 1
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 0, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
softmax_warp_forward<input_t, output_t, acc_t, 0, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 1: // 2
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 1, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
softmax_warp_forward<input_t, output_t, acc_t, 1, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 2: // 4
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 2, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
softmax_warp_forward<input_t, output_t, acc_t, 2, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 3: // 8
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 3, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
softmax_warp_forward<input_t, output_t, acc_t, 3, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 4: // 16
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 4, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
softmax_warp_forward<input_t, output_t, acc_t, 4, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 5: // 32
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 5, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
softmax_warp_forward<input_t, output_t, acc_t, 5, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 6: // 64
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 6, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
softmax_warp_forward<input_t, output_t, acc_t, 6, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 7: // 128
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 7, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
softmax_warp_forward<input_t, output_t, acc_t, 7, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 8: // 256
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 8, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
softmax_warp_forward<input_t, output_t, acc_t, 8, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 9: // 512
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 9, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
softmax_warp_forward<input_t, output_t, acc_t, 9, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 10: // 1024
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 10, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
softmax_warp_forward<input_t, output_t, acc_t, 10, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
default:
break;

View file

@ -30,7 +30,7 @@ template <typename T>
void Fill(hipStream_t stream, T* output, T value, int64_t count) {
int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
HIP_LONG N = static_cast<HIP_LONG>(count);
hipLaunchKernelGGL(HIP_KERNEL_NAME(_Fill<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream, output, value, N);
_Fill<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream>>>(output, value, N);
}
template <typename T>
class ConstantBufferImpl : public IConstantBuffer<T> {