diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.cc b/onnxruntime/core/providers/cuda/tensor/transpose.cc index 33bf3ce05e..e429a1eefb 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose.cc +++ b/onnxruntime/core/providers/cuda/tensor/transpose.cc @@ -166,13 +166,26 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop, if (CanDoTranspose3D(new_rank, new_input_dims, new_permutations)) { return Transpose3DImpl(stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), output.MutableDataRaw(), output.Shape().Size()); - } else if (CanDoTranspose4D(prop, element_size, new_rank, new_input_dims, new_permutations)) { + } else if (CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim( + prop, element_size, new_rank, new_input_dims, new_permutations)) { TArray tmp_output_strides(new_rank); for (auto i = 0; i < new_rank; i++) { tmp_output_strides[i] = new_output_strides[new_permutations[i]]; } - return Transpose4DImpl(stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), - tmp_output_strides, output.MutableDataRaw(), gsl::narrow(output.Shape().Size())); + return Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim( + stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), + tmp_output_strides, output.MutableDataRaw(), gsl::narrow(output.Shape().Size())); + } else if (CanDoTranspose4DParallelizeOneElementPerThread( + prop, element_size, new_rank, new_input_dims, new_permutations)) { + // Trying to see if we can still do (best effort) more optimized transposing + // for the 4-D case before falling back to the generic case + TArray tmp_output_strides(new_rank); + for (auto i = 0; i < new_rank; i++) { + tmp_output_strides[i] = new_output_strides[new_permutations[i]]; + } + return Transpose4DParallelizeOneElementPerThread( + stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), + tmp_output_strides, output.MutableDataRaw(), gsl::narrow(output.Shape().Size())); } // General cases diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu index 10611c9cd9..006dce292f 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu @@ -80,9 +80,10 @@ Status Transpose3DImpl(cudaStream_t stream, size_t element_size, } template -__global__ void Transpose4DKernel(const TArray input_strides, const void* input_data, - const TArray output_strides, void* output_data, - CUDA_LONG N) { +__global__ void Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim( + const TArray input_strides, const void* input_data, + const TArray output_strides, void* output_data, + CUDA_LONG N) { // output coordinates will be: blockIdx.y, blockIdx.x, threadIdx.y, threadIdx.x CUDA_LONG input_index = (blockIdx.y * input_strides[0] + blockIdx.x * input_strides[1] + @@ -104,59 +105,69 @@ __global__ void Transpose4DKernel(const TArray input_strides, const voi } } -bool CanDoTranspose4D(const cudaDeviceProp& prop, - size_t element_size, - int32_t rank, - const std::vector& input_dims, - const std::vector& permutations) { +bool CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(const cudaDeviceProp& prop, + size_t element_size, + int32_t rank, + const std::vector& input_dims, + const std::vector& permutations) { if (rank == 4 && // the permutations is not on the last dimension. - permutations[rank - 1] == (rank - 1)) { - // The block size will be set based on the last two dimensions of 4D tensor. + permutations[3] == 3) { + // The block size will be set based on the outer-most two dimensions of 4D tensor. // the number threads per block will be calculated as below. unsigned int num_elements_per_thread = 4 * sizeof(int) / static_cast(element_size); // int4 is used in the kernel to access data. - int64_t num_elements_in_last_two_dimensions = input_dims[rank - 2] * input_dims[rank - 1]; + int64_t num_elements_in_last_two_dimensions = input_dims[2] * input_dims[3]; int64_t num_threads_per_block = num_elements_in_last_two_dimensions / num_elements_per_thread; if (((num_elements_in_last_two_dimensions & (num_elements_per_thread - 1)) == 0) && num_threads_per_block <= prop.maxThreadsPerBlock && num_threads_per_block >= prop.warpSize && - // num_threads_per_block must be aligned with warp size: 32 - ((num_threads_per_block & (prop.warpSize - 1)) == 0)) { + // num_threads_per_block must be a multiple of warp size (32) + ((num_threads_per_block & (prop.warpSize - 1)) == 0) && + // input_dims[3] must be a multiple of `num_elements_per_thread` + ((input_dims[3] % num_elements_per_thread) == 0)) { return true; } } return false; } -Status Transpose4DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, - const TArray& output_strides, void* output_data, int N) { +Status Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim( + cudaStream_t stream, size_t element_size, + const TArray& input_shape, const TArray& input_strides, + const void* input_data, const TArray& output_strides, + void* output_data, int N) { unsigned int num_elements_per_thread = 4 * sizeof(int) / static_cast(element_size); // int4 is used in the kernel to access data. dim3 block_size(static_cast(input_shape[3] / num_elements_per_thread), static_cast(input_shape[2])); dim3 grid_size(static_cast(input_shape[1]), static_cast(input_shape[0])); switch (element_size) { case sizeof(int8_t): - Transpose4DKernel<<>>( - input_strides, input_data, - output_strides, output_data, N / num_elements_per_thread); + Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim + <<>>( + input_strides, input_data, + output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int16_t): - Transpose4DKernel<<>>( - input_strides, input_data, - output_strides, output_data, N / num_elements_per_thread); + Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim + <<>>( + input_strides, input_data, + output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int32_t): - Transpose4DKernel<<>>( - input_strides, input_data, - output_strides, output_data, N / num_elements_per_thread); + Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim + <<>>( + input_strides, input_data, + output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int64_t): - Transpose4DKernel<<>>( - input_strides, input_data, - output_strides, output_data, N / num_elements_per_thread); + Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim + <<>>( + input_strides, input_data, + output_strides, output_data, N / num_elements_per_thread); break; default: + // User will not hit this as this kernel is for fixed element size tensors only return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on CUDA. Element size was ", element_size); } @@ -164,6 +175,77 @@ Status Transpose4DImpl(cudaStream_t stream, size_t element_size, const TArray input_strides, const int8_t* input_data, + const TArray output_strides, int8_t* output_data, + size_t element_size, + CUDA_LONG N) { + CUDA_LONG input_index = blockIdx.y * input_strides[0] + + blockIdx.x * input_strides[1] + + threadIdx.y * input_strides[2] + + threadIdx.x * input_strides[3]; + + CUDA_LONG output_index = blockIdx.y * output_strides[0] + + blockIdx.x * output_strides[1] + + threadIdx.y * output_strides[2] + + threadIdx.x * output_strides[3]; + + if (input_index < N && output_index < N) { + const int8_t* input_data_to_be_copied = input_data + (input_index * element_size); + int8_t* output_data_to_be_copied = output_data + (output_index * element_size); + + // copy over the bytes + for (size_t iter = 0; iter < element_size; ++iter) { + *output_data_to_be_copied++ = *input_data_to_be_copied++; + } + } +} + +bool CanDoTranspose4DParallelizeOneElementPerThread(const cudaDeviceProp& prop, + size_t element_size, + int32_t rank, + const std::vector& input_dims, + const std::vector& permutations) { + if (rank == 4) { + // The block size will be set based on the outer-most two dimensions of 4D tensor. + // the number threads per block will be calculated as below. + int64_t number_of_threads_per_block = input_dims[2] * input_dims[3]; + + if (number_of_threads_per_block <= prop.maxThreadsPerBlock && + number_of_threads_per_block >= prop.warpSize && + // num_threads_per_block must be a multiple of warp size (32) + ((number_of_threads_per_block & (prop.warpSize - 1)) == 0)) { + return true; + } + } + return false; +} + +Status Transpose4DParallelizeOneElementPerThread( + cudaStream_t stream, size_t element_size, + const TArray& input_shape, const TArray& input_strides, + const void* input_data, const TArray& output_strides, + void* output_data, int N) { + if (element_size != sizeof(int8_t) && + element_size != sizeof(int16_t) && + element_size != sizeof(int32_t) && + element_size != sizeof(int64_t)) { + // User will not hit this as this kernel is for fixed element size tensors only + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on CUDA. Element size was ", + element_size); + } + + dim3 block_size(static_cast(input_shape[3]), static_cast(input_shape[2])); + dim3 grid_size(static_cast(input_shape[1]), static_cast(input_shape[0])); + + Transpose4DKernelParallelizeOneElementPerThread<<>>( + input_strides, reinterpret_cast(input_data), + output_strides, reinterpret_cast(output_data), + element_size, N); + + return Status::OK(); +} + template __global__ void TransposeKernel(int32_t shape_rank, const TArray input_strides, const T* input_data, const TArray output_strides, T* output_data, CUDA_LONG N) { diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.h b/onnxruntime/core/providers/cuda/tensor/transpose_impl.h index 1a4d469776..a9184d2a16 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.h @@ -11,13 +11,25 @@ namespace cuda { bool CanDoTranspose3D(int32_t rank, const std::vector& input_dims, const std::vector& permutations); Status Transpose3DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, void* output_data, int64_t N); -bool CanDoTranspose4D(const cudaDeviceProp& prop, - size_t element_size, - int32_t rank, - const std::vector& input_dims, - const std::vector& permutations); -Status Transpose4DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, - const TArray& output_strides, void* output_data, int N); + +bool CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(const cudaDeviceProp& prop, + size_t element_size, + int32_t rank, + const std::vector& input_dims, + const std::vector& permutations); +Status Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim(cudaStream_t stream, size_t element_size, const TArray& input_shape, + const TArray& input_strides, const void* input_data, + const TArray& output_strides, void* output_data, int N); + +bool CanDoTranspose4DParallelizeOneElementPerThread(const cudaDeviceProp& prop, + size_t element_size, + int32_t rank, + const std::vector& input_dims, + const std::vector& permutations); +Status Transpose4DParallelizeOneElementPerThread(cudaStream_t stream, size_t element_size, const TArray& input_shape, + const TArray& input_strides, const void* input_data, + const TArray& output_strides, void* output_data, int N); + Status TransposeImpl(cudaStream_t stream, size_t element_size, int32_t shape_rank, const TArray& input_strides, const void* input_data, const TArray& fdm_output_strides, void* output_data, int N); } // namespace cuda diff --git a/onnxruntime/core/providers/rocm/tensor/transpose.cc b/onnxruntime/core/providers/rocm/tensor/transpose.cc index 38b2a9cef1..61e1147abe 100644 --- a/onnxruntime/core/providers/rocm/tensor/transpose.cc +++ b/onnxruntime/core/providers/rocm/tensor/transpose.cc @@ -62,16 +62,16 @@ Status TransposeWithRocblas(hipStream_t stream, rocblas_handle rocblas_handle, c HipT* output_data = reinterpret_cast(output.MutableData()); ROCBLAS_RETURN_IF_ERROR( rocblasTransposeHelper(stream, - rocblas_handle, - rocblas_operation_transpose, rocblas_operation_transpose, M, N, - &one, - input_data, - N, - &zero, - input_data, - N, - output_data, - M)); + rocblas_handle, + rocblas_operation_transpose, rocblas_operation_transpose, M, N, + &one, + input_data, + N, + &zero, + input_data, + N, + output_data, + M)); return Status::OK(); } @@ -128,25 +128,25 @@ Status Transpose::DoTranspose(const hipDeviceProp_t& prop, new_permutations[j] -= 1; } } - for (auto j = i+1; j < new_rank; j++) { - new_permutations[j-1] = new_permutations[j]; + for (auto j = i + 1; j < new_rank; j++) { + new_permutations[j - 1] = new_permutations[j]; } // update input dims new_input_dims[prev] *= new_input_dims[curr]; new_input_dims[curr] = 1; - for (auto j = static_cast(curr+1); j < new_rank; j++) { - new_input_dims[j-1] = new_input_dims[j]; + for (auto j = static_cast(curr + 1); j < new_rank; j++) { + new_input_dims[j - 1] = new_input_dims[j]; } - new_input_dims[new_rank-1] = 1; + new_input_dims[new_rank - 1] = 1; // update output dims - new_output_dims[i-1] *= new_output_dims[i]; + new_output_dims[i - 1] *= new_output_dims[i]; new_output_dims[i] = 1; - for (auto j = i+1; j < new_rank; j++) { - new_output_dims[j-1] = new_output_dims[j]; + for (auto j = i + 1; j < new_rank; j++) { + new_output_dims[j - 1] = new_output_dims[j]; } - new_output_dims[new_rank-1] = 1; + new_output_dims[new_rank - 1] = 1; new_rank--; } @@ -166,13 +166,26 @@ Status Transpose::DoTranspose(const hipDeviceProp_t& prop, if (CanDoTranspose3D(new_rank, new_input_dims, new_permutations)) { return Transpose3DImpl(stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), output.MutableDataRaw(), output.Shape().Size()); - } else if (CanDoTranspose4D(prop, element_size, new_rank, new_input_dims, new_permutations)) { + } else if (CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim( + prop, element_size, new_rank, new_input_dims, new_permutations)) { TArray tmp_output_strides(new_rank); for (auto i = 0; i < new_rank; i++) { tmp_output_strides[i] = new_output_strides[new_permutations[i]]; } - return Transpose4DImpl(stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), - tmp_output_strides, output.MutableDataRaw(), output.Shape().Size()); + return Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim( + stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), + tmp_output_strides, output.MutableDataRaw(), gsl::narrow(output.Shape().Size())); + } else if (CanDoTranspose4DParallelizeOneElementPerThread( + prop, element_size, new_rank, new_input_dims, new_permutations)) { + // Trying to see if we can still do (best effort) more optimized transposing + // for the 4-D case before falling back to the generic case + TArray tmp_output_strides(new_rank); + for (auto i = 0; i < new_rank; i++) { + tmp_output_strides[i] = new_output_strides[new_permutations[i]]; + } + return Transpose4DParallelizeOneElementPerThread( + stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), + tmp_output_strides, output.MutableDataRaw(), gsl::narrow(output.Shape().Size())); } // General cases diff --git a/onnxruntime/test/providers/cpu/tensor/transpose_test.cc b/onnxruntime/test/providers/cpu/tensor/transpose_test.cc index b971d85072..515fa120c6 100644 --- a/onnxruntime/test/providers/cpu/tensor/transpose_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/transpose_test.cc @@ -590,26 +590,34 @@ static void TestTranspose( test.CompareWithCPU(kGpuExecutionProvider, error_tolerance); } -TEST(TransposeOpTest, Transpose0213) { +TEST(TransposeOpTest, Transpose0213) { // Will trigger Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim() const std::vector X_dims{64, 128, 16, 64}; const std::vector perm{0, 2, 1, 3}; const std::vector Y_dims{64, 16, 128, 64}; TestTranspose(perm, X_dims, Y_dims); } -TEST(TransposeOpTest, Transpose0231) { +TEST(TransposeOpTest, Transpose0213_V2) { // Will trigger Transpose4DParallelizeOneElementPerThread() + const std::vector X_dims{64, 128, 64, 2}; + const std::vector perm{0, 2, 1, 3}; + const std::vector Y_dims{64, 64, 128, 2}; + TestTranspose(perm, X_dims, Y_dims); +} + +TEST(TransposeOpTest, Transpose0231) { // Will trigger Transpose3DImpl() because of "flattening" of dims 2 and 3 into one dim const std::vector X_dims{64, 128, 16, 64}; const std::vector perm{0, 2, 3, 1}; const std::vector Y_dims{64, 16, 64, 128}; TestTranspose(perm, X_dims, Y_dims); } -TEST(TransposeOpTest, Transpose0312) { +TEST(TransposeOpTest, Transpose0312) { // Will trigger Transpose3DImpl() because of "flattening" of dims 1 and 2 into one dim const std::vector X_dims{64, 16, 64, 128}; const std::vector perm{0, 3, 1, 2}; const std::vector Y_dims{64, 128, 16, 64}; TestTranspose(perm, X_dims, Y_dims); } + #endif } // namespace test