mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-08 00:23:03 +00:00
Remove all usages of hipLaunchKernelGGL (#14089)
All thoes macro syntaxes are mistake by https://github.com/ROCm-Developer-Tools/HIP-CPU/issues/8#issuecomment-756188453, they should be corrected in documentation but is not. We moved away hipThreadIdx_* in some previous commits, now we move away from hipLaunchKernelGGL.
This commit is contained in:
parent
6a9dc6c993
commit
613920d6c5
5 changed files with 84 additions and 84 deletions
|
|
@ -292,32 +292,32 @@ Status ComputeSoftmax(
|
|||
const dim3 grid(sequence_length * num_heads, batch_size, 1);
|
||||
if (all_sequence_length <= 32) {
|
||||
const int blockSize = 32;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (all_sequence_length <= 64) {
|
||||
const int blockSize = 64;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (all_sequence_length <= 128) {
|
||||
const int blockSize = 128;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (all_sequence_length <= 256) {
|
||||
const int blockSize = 256;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (all_sequence_length <= 512) {
|
||||
const int blockSize = 512;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (all_sequence_length <= 1024) {
|
||||
const int blockSize = 1024;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
SoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (!is_unidirectional) {
|
||||
const int blockSize = 1024;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxKernel<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output);
|
||||
SoftmaxKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, add_before_softmax, input, output);
|
||||
} else {
|
||||
ORT_THROW("Attention ROCM operator does not support total sequence length > 1024.");
|
||||
}
|
||||
|
|
@ -403,39 +403,39 @@ Status ComputeSoftmaxWithMask1D(
|
|||
|
||||
if (all_sequence_length <= 32) {
|
||||
const int blockSize = 32;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (all_sequence_length <= 64) {
|
||||
const int blockSize = 64;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (all_sequence_length <= 128) {
|
||||
const int blockSize = 128;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (all_sequence_length <= 256) {
|
||||
const int blockSize = 256;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (all_sequence_length <= 512) {
|
||||
const int blockSize = 512;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (all_sequence_length <= 1024) {
|
||||
const int blockSize = 1024;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernelSmall<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
MaskedSoftmaxKernelSmall<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output, is_unidirectional);
|
||||
} else if (!is_unidirectional) {
|
||||
const int blockSize = 1024;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(MaskedSoftmaxKernel<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output);
|
||||
MaskedSoftmaxKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length, mask_index, mask_start,
|
||||
add_before_softmax, input, output);
|
||||
} else {
|
||||
ORT_THROW("Attention ROCM operator does not support total sequence length > 1024.");
|
||||
}
|
||||
|
|
@ -465,46 +465,46 @@ Status ComputeSoftmaxWithRawMask(hipStream_t stream,
|
|||
T* out = use_persistent_softmax ? persistent_softmax_workspace : output;
|
||||
if (all_sequence_length <= 32) {
|
||||
const int blockSize = 32;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
} else if (all_sequence_length <= 64) {
|
||||
const int blockSize = 64;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
} else if (all_sequence_length <= 128) {
|
||||
const int blockSize = 128;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
} else if (all_sequence_length <= 256) {
|
||||
const int blockSize = 256;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
} else if (all_sequence_length <= 512) {
|
||||
const int blockSize = 512;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
} else if (all_sequence_length <= 1024) {
|
||||
const int blockSize = 1024;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(SoftmaxWithRawMaskSmallKernel<T, blockSize>), grid, blockSize, 0, stream,
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
SoftmaxWithRawMaskSmallKernel<T, blockSize><<<grid, blockSize, 0, stream>>>(
|
||||
all_sequence_length, sequence_length,
|
||||
attention_mask, key_padding_mask, add_before_softmax, input, out,
|
||||
is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length,
|
||||
use_persistent_softmax);
|
||||
} else {
|
||||
ORT_THROW("Attention ROCM operator does not support total sequence length > 1024.");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocbla
|
|||
dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1);
|
||||
dim3 dimBlock(TRANS_TILE_DIM, BLOCK_ROWS, 1);
|
||||
|
||||
hipLaunchKernelGGL(transposeNoOverlap, dim3(dimGrid), dim3(dimBlock), 0, stream, C, A, n, m);
|
||||
transposeNoOverlap<<<dim3(dimGrid), dim3(dimBlock), 0, stream>>>(C, A, n, m);
|
||||
} else {
|
||||
return rocblas_status_not_implemented;
|
||||
}
|
||||
|
|
@ -68,7 +68,7 @@ rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocbla
|
|||
rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, const half* x, int incx, half* y, int incy) {
|
||||
dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
|
||||
dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
|
||||
hipLaunchKernelGGL(CopyVectorHalf, dim3(dimGrid), dim3(dimBlock), 0, stream, x, incx, y, incy, n);
|
||||
CopyVectorHalf<<<dim3(dimGrid), dim3(dimBlock), 0, stream>>>(x, incx, y, incy, n);
|
||||
return rocblas_status_success;
|
||||
}
|
||||
|
||||
|
|
@ -76,6 +76,6 @@ rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, cons
|
|||
onnxruntime::BFloat16* y, int incy) {
|
||||
dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
|
||||
dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
|
||||
hipLaunchKernelGGL(CopyVectorBFloat16, dim3(dimGrid), dim3(dimBlock), 0, stream, x, incx, y, incy, n);
|
||||
CopyVectorBFloat16<<<dim3(dimGrid), dim3(dimBlock), 0, stream>>>(x, incx, y, incy, n);
|
||||
return rocblas_status_success;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -63,21 +63,21 @@ void DiagonalImpl(
|
|||
|
||||
switch (element_size) {
|
||||
case sizeof(int32_t):
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(_DiagonalKernel<int32_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
|
||||
_DiagonalKernel<int32_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
|
||||
reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data), input_rank, dim_1, dim_2,
|
||||
input_strides, reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data), output_strides,
|
||||
output_size);
|
||||
break;
|
||||
|
||||
case sizeof(int64_t):
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(_DiagonalKernel<int64_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
|
||||
_DiagonalKernel<int64_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
|
||||
reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data), input_rank, dim_1, dim_2,
|
||||
input_strides, reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data), output_strides,
|
||||
output_size);
|
||||
break;
|
||||
|
||||
case sizeof(int16_t):
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(_DiagonalKernel<half>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
|
||||
_DiagonalKernel<half><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
|
||||
reinterpret_cast<const half*>(input_data), input_rank, dim_1, dim_2,
|
||||
input_strides, reinterpret_cast<half*>(output_data), output_strides,
|
||||
output_size);
|
||||
|
|
|
|||
|
|
@ -52,37 +52,37 @@ void dispatch_warpwise_softmax_forward(hipStream_t stream, output_t* dst, const
|
|||
// Launch code would be more elegant if C++ supported FOR CONSTEXPR
|
||||
switch (log2_elements) {
|
||||
case 0: // 1
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 0, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
softmax_warp_forward<input_t, output_t, acc_t, 0, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
break;
|
||||
case 1: // 2
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 1, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
softmax_warp_forward<input_t, output_t, acc_t, 1, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
break;
|
||||
case 2: // 4
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 2, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
softmax_warp_forward<input_t, output_t, acc_t, 2, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
break;
|
||||
case 3: // 8
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 3, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
softmax_warp_forward<input_t, output_t, acc_t, 3, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
break;
|
||||
case 4: // 16
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 4, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
softmax_warp_forward<input_t, output_t, acc_t, 4, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
break;
|
||||
case 5: // 32
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 5, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
softmax_warp_forward<input_t, output_t, acc_t, 5, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
break;
|
||||
case 6: // 64
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 6, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
softmax_warp_forward<input_t, output_t, acc_t, 6, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
break;
|
||||
case 7: // 128
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 7, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
softmax_warp_forward<input_t, output_t, acc_t, 7, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
break;
|
||||
case 8: // 256
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 8, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
softmax_warp_forward<input_t, output_t, acc_t, 8, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
break;
|
||||
case 9: // 512
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 9, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
softmax_warp_forward<input_t, output_t, acc_t, 9, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
break;
|
||||
case 10: // 1024
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward<input_t, output_t, acc_t, 10, is_log_softmax>), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
softmax_warp_forward<input_t, output_t, acc_t, 10, is_log_softmax><<<dim3(blocks), dim3(threads), 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ template <typename T>
|
|||
void Fill(hipStream_t stream, T* output, T value, int64_t count) {
|
||||
int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
HIP_LONG N = static_cast<HIP_LONG>(count);
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(_Fill<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream, output, value, N);
|
||||
_Fill<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream>>>(output, value, N);
|
||||
}
|
||||
template <typename T>
|
||||
class ConstantBufferImpl : public IConstantBuffer<T> {
|
||||
|
|
|
|||
Loading…
Reference in a new issue