From 0bad5b1b5ab9277108f3bbcbe5c62d60a67bbd5f Mon Sep 17 00:00:00 2001 From: Vincent Wang Date: Sat, 16 Apr 2022 07:46:43 +0800 Subject: [PATCH] [CUDA] Rollback TileMemcpy and TileBatchedMemcpy when Block Size is Small (#11187) --- .../core/providers/cuda/tensor/tile_impl.cu | 172 ++++++++--- .../test/providers/cpu/tensor/tile_op_test.cc | 275 +++++++++--------- 2 files changed, 272 insertions(+), 175 deletions(-) diff --git a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu index 29632444b7..e3ef2965c5 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu @@ -33,9 +33,9 @@ __global__ void _UnRolledTileKernel(const size_t shape_rank, const TArray -__global__ void _TileMemcpyKernel(const T* input_data, T* output_data, const CUDA_LONG N, const size_t repeats) { +__global__ void _TileMemcpyKernelFromOutput(const T* input_data, T* output_data, + const fast_divmod divmod_num_input_elements, const CUDA_LONG N) { + CUDA_LONG start = num_elements_per_thread * num_threads_per_block * blockIdx.x + threadIdx.x; + T value[num_elements_per_thread]; + CUDA_LONG id = start; +#pragma unroll + for (int i = 0; i < num_elements_per_thread; ++i) { + if (id < N) { + value[i] = input_data[divmod_num_input_elements.mod(id)]; + id += num_threads_per_block; + } + } + + id = start; +#pragma unroll + for (int i = 0; i < num_elements_per_thread; ++i) { + if (id < N) { + output_data[id] = value[i]; + id += num_threads_per_block; + } + } +} + +template +__global__ void _TileMemcpyKernelFromInput(const T* input_data, T* output_data, const CUDA_LONG N, + const size_t repeats) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); T input_val = input_data[id]; for (size_t i = 0; i < repeats; ++i) { @@ -76,42 +101,94 @@ __global__ void _TileMemcpyKernel(const T* input_data, T* output_data, const CUD } template -void TileMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_data, const size_t num_input_elements, - const size_t repeats) { - using Vec4T = aligned_vector; - using Vec2T = aligned_vector; - constexpr int vec4_alignment = std::alignment_of::value; - constexpr int vec2_alignment = std::alignment_of::value; - uint64_t address_input = reinterpret_cast(input_data); - uint64_t address_output = reinterpret_cast(output_data); - CUDA_LONG N = static_cast(num_input_elements); - if (num_input_elements % 4 == 0 && address_input % vec4_alignment == 0 && address_output % vec4_alignment == 0) { +size_t GetVectorizedSize(size_t num_input_elements, size_t num_elements_per_batch, uint64_t address_input, + uint64_t address_output, CUDA_LONG& N, int& blocksPerGrid) { + constexpr int vec4_alignment = std::alignment_of>::value; + constexpr int vec2_alignment = std::alignment_of>::value; + N = static_cast(num_input_elements); + size_t vectorized_size = 1; + if (num_elements_per_batch % 4 == 0 && address_input % vec4_alignment == 0 && address_output % vec4_alignment == 0) { N /= 4; - int blocksPerGrid = CeilDiv(N, num_threads_per_block); - _TileMemcpyKernel<<>>( - reinterpret_cast(input_data), reinterpret_cast(output_data), N, repeats); - return; - } else if (num_input_elements % 2 == 0 && address_input % vec2_alignment == 0 && + vectorized_size = 4; + } else if (num_elements_per_batch % 2 == 0 && address_input % vec2_alignment == 0 && address_output % vec2_alignment == 0) { N /= 2; - int blocksPerGrid = CeilDiv(N, num_threads_per_block); - _TileMemcpyKernel<<>>( + vectorized_size = 2; + } + blocksPerGrid = CeilDiv(N, num_threads_per_block); + return vectorized_size; +} + +template +void TileMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_data, const size_t num_input_elements, + const size_t repeats) { + // If the block number from input size is too small to fill all streaming multiprocessors, + // it won't have perf gain to launch from inputs. In this case we will use the output based kernel. + CUDA_LONG N; + int blocksPerGrid; + size_t vectorized_size = + GetVectorizedSize(num_input_elements, num_input_elements, reinterpret_cast(input_data), + reinterpret_cast(output_data), N, blocksPerGrid); + if (blocksPerGrid < 128) { + N = static_cast(num_input_elements * repeats); + blocksPerGrid = CeilDiv(N, num_threads_per_block * num_elements_per_thread); + _TileMemcpyKernelFromOutput<<>>( + input_data, output_data, fast_divmod(static_cast(num_input_elements)), N); + return; + } + + if (vectorized_size == 4) { + using Vec4T = aligned_vector; + _TileMemcpyKernelFromInput<<>>( + reinterpret_cast(input_data), reinterpret_cast(output_data), N, repeats); + return; + } else if (vectorized_size == 2) { + using Vec2T = aligned_vector; + _TileMemcpyKernelFromInput<<>>( reinterpret_cast(input_data), reinterpret_cast(output_data), N, repeats); return; } - int blocksPerGrid = CeilDiv(N, num_threads_per_block); - _TileMemcpyKernel<<>>(input_data, output_data, N, repeats); + _TileMemcpyKernelFromInput<<>>(input_data, output_data, N, repeats); +} + +template +__global__ void _TileBatchedMemcpyKernelFromOutput(const T* input_data, T* output_data, + const fast_divmod divmod_size_output_row, + const size_t size_input_row, const fast_divmod divmod_batch, + const fast_divmod divmod_size_input_row, const CUDA_LONG N) { + CUDA_LONG start = num_elements_per_thread * num_threads_per_block * blockIdx.x + threadIdx.x; + T value[num_elements_per_thread]; + CUDA_LONG id = start; +#pragma unroll + for (int i = 0; i < num_elements_per_thread; ++i) { + if (id < N) { + int batch_idx, element_idx; + divmod_size_output_row.divmod(id, batch_idx, element_idx); + value[i] = input_data[divmod_batch.mod(batch_idx) * size_input_row + divmod_size_input_row.mod(element_idx)]; + id += num_threads_per_block; + } + } + + id = start; +#pragma unroll + for (int i = 0; i < num_elements_per_thread; ++i) { + if (id < N) { + output_data[id] = value[i]; + id += num_threads_per_block; + } + } } // Input size is [batch, data], output size is [batch * batch_repeats, data * repeats_per_batch]. // Here size_input_row = data, size_output_row = data * repeats_per_batch, // size_output_batch = batch * data * repeats_per_batch template -__global__ void _TileBatchedMemcpyKernel(const T* input_data, T* output_data, const fast_divmod divmod_size_input_row, - const CUDA_LONG size_input_row, const CUDA_LONG size_output_row, - const CUDA_LONG size_output_batch, const size_t batch_repeats, - const size_t repeats_per_batch, const CUDA_LONG N) { +__global__ void _TileBatchedMemcpyKernelFromInput(const T* input_data, T* output_data, + const fast_divmod divmod_size_input_row, + const CUDA_LONG size_input_row, const CUDA_LONG size_output_row, + const CUDA_LONG size_output_batch, const size_t batch_repeats, + const size_t repeats_per_batch, const CUDA_LONG N) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); T input_val = input_data[id]; CUDA_LONG q, r; @@ -133,29 +210,37 @@ template void TileBatchedMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_data, const size_t size_input_row, const size_t num_input_elements, const size_t batch_repeats, const size_t repeats_per_batch) { - using Vec4T = aligned_vector; - using Vec2T = aligned_vector; - constexpr int vec4_alignment = std::alignment_of::value; - constexpr int vec2_alignment = std::alignment_of::value; - uint64_t address_input = reinterpret_cast(input_data); - uint64_t address_output = reinterpret_cast(output_data); + // If the block number from input size is too small to fill all streaming multiprocessors, + // it won't have perf gain to launch from inputs. In this case we will use the output based kernel. + CUDA_LONG N; + int blocksPerGrid; + size_t vectorized_size = + GetVectorizedSize(num_input_elements, size_input_row, reinterpret_cast(input_data), + reinterpret_cast(output_data), N, blocksPerGrid); + if (blocksPerGrid < 128) { + N = static_cast(num_input_elements * batch_repeats * repeats_per_batch); + blocksPerGrid = CeilDiv(N, num_threads_per_block * num_elements_per_thread); + _TileBatchedMemcpyKernelFromOutput<<>>( + input_data, output_data, fast_divmod(static_cast(size_input_row * repeats_per_batch)), size_input_row, + fast_divmod(static_cast(num_input_elements / size_input_row)), + fast_divmod(static_cast(size_input_row)), N); + return; + } + CUDA_LONG size_input_row_vec = static_cast(size_input_row); - CUDA_LONG N = static_cast(num_input_elements); - if (size_input_row % 4 == 0 && address_input % vec4_alignment == 0 && address_output % vec4_alignment == 0) { + if (vectorized_size == 4) { + using Vec4T = aligned_vector; size_input_row_vec /= 4; - N /= 4; - int blocksPerGrid = CeilDiv(N, num_threads_per_block); - _TileBatchedMemcpyKernel<<>>( + _TileBatchedMemcpyKernelFromInput<<>>( reinterpret_cast(input_data), reinterpret_cast(output_data), fast_divmod(size_input_row_vec), size_input_row_vec, size_input_row_vec * static_cast(repeats_per_batch), N * static_cast(repeats_per_batch), batch_repeats, repeats_per_batch, N); return; - } else if (size_input_row % 2 == 0 && address_input % vec2_alignment == 0 && address_output % vec2_alignment == 0) { + } else if (vectorized_size == 2) { + using Vec2T = aligned_vector; size_input_row_vec /= 2; - N /= 2; - int blocksPerGrid = CeilDiv(N, num_threads_per_block); - _TileBatchedMemcpyKernel<<>>( + _TileBatchedMemcpyKernelFromInput<<>>( reinterpret_cast(input_data), reinterpret_cast(output_data), fast_divmod(size_input_row_vec), size_input_row_vec, size_input_row_vec * static_cast(repeats_per_batch), N * static_cast(repeats_per_batch), @@ -163,8 +248,7 @@ void TileBatchedMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_d return; } - int blocksPerGrid = static_cast(CeilDiv(N, num_threads_per_block)); - _TileBatchedMemcpyKernel<<>>( + _TileBatchedMemcpyKernelFromInput<<>>( input_data, output_data, fast_divmod(size_input_row_vec), size_input_row_vec, size_input_row_vec * static_cast(repeats_per_batch), N * static_cast(repeats_per_batch), batch_repeats, repeats_per_batch, N); diff --git a/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc b/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc index 45c36417f5..773a0cb3e2 100644 --- a/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc @@ -8,210 +8,223 @@ namespace onnxruntime { namespace test { template -void RunTest(std::initializer_list input, - std::initializer_list input_dims, - std::initializer_list repeat, - std::initializer_list repeat_dims, - std::initializer_list output, - std::initializer_list output_dims) { +std::vector InputData(size_t size) { + std::vector result(size); + for (size_t i = 0; i < size; i++) { + result[i] = static_cast(i); + } + return result; +} + +template <> +std::vector InputData(size_t size) { + std::vector result(size); + for (size_t i = 0; i < size; i++) { + result[i] = MLFloat16(static_cast(i)); + } + return result; +} + +template +void RunTest(const std::vector& input_dims, const std::vector& repeats) { + size_t input_size = + static_cast(std::accumulate(input_dims.begin(), input_dims.end(), 1LL, std::multiplies())); + std::vector input_data = InputData(input_size); + size_t rank = input_dims.size(); + std::vector repeats_dims(1); + repeats_dims[0] = static_cast(rank); + std::vector output_dims(rank); + for (size_t i = 0; i < rank; ++i) { + output_dims[i] = input_dims[i] * repeats[i]; + } + size_t output_size = + static_cast(std::accumulate(output_dims.begin(), output_dims.end(), 1LL, std::multiplies())); + std::vector output_data(output_size); + std::vector input_strides(rank); + std::vector output_strides(rank); + input_strides[rank - 1] = output_strides[rank - 1] = 1; + if (rank > 1) { + for (size_t i = rank - 2;; --i) { + input_strides[i] = input_dims[i + 1] * input_strides[i + 1]; + output_strides[i] = output_dims[i + 1] * output_strides[i + 1]; + if (i == 0) break; + } + } + for (size_t i = 0; i < output_size; ++i) { + int64_t index = 0; + int64_t remain = static_cast(i); + for (size_t j = 0; j < rank; ++j) { + index += (((remain / output_strides[j]) % input_dims[j]) * input_strides[j]); + remain = remain % output_strides[j]; + } + output_data[i] = input_data[static_cast(index)]; + } OpTester test("Tile"); - test.AddInput("input", input_dims, input); - test.AddInput("repeats", repeat_dims, repeat); - test.AddOutput("output", output_dims, output); - if (std::is_same::value) - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT reports error: Assertion Error in makePaddedScale: 0 (regionRanges != nullptr) - else + test.AddInput("input", input_dims, input_data); + test.AddInput("repeats", repeats_dims, repeats); + test.AddOutput("output", output_dims, output_data); + if (std::is_same::value) { + // TensorRT reports error: Assertion Error in makePaddedScale: 0 (regionRanges != nullptr) + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + } else { test.Run(); + } } template void RunTestWrapper() { // Tile1DWithZeroRepeats - RunTest({1, 2, 3}, {3}, {0}, {1}, {}, {0}); + RunTest({3}, {0}); // Tile2DWithZeroRepeats - RunTest({11, 12, 21, 22}, {2, 2}, {2, 0}, {2}, {}, {4, 0}); + RunTest({2, 2}, {2, 0}); // Tile1D - RunTest({1, 2, 3}, {3}, {3}, {1}, {1, 2, 3, 1, 2, 3, 1, 2, 3}, {9}); + RunTest({3}, {3}); // Tile2D_1Axis - RunTest({11, 12, 21, 22}, {2, 2}, {2, 1}, {2}, {11, 12, 21, 22, 11, 12, 21, 22}, {4, 2}); - RunTest({11, 12, 21, 22, 31, 32}, {2, 3}, {2, 1}, {2}, {11, 12, 21, 22, 31, 32, 11, 12, 21, 22, 31, 32}, {4, 3}); + RunTest({2, 2}, {2, 1}); + RunTest({2, 3}, {2, 1}); // Tile2D_2Axes - RunTest({11, 12, 21, 22}, {2, 2}, {2, 2}, {2}, {11, 12, 11, 12, 21, 22, 21, 22, 11, 12, 11, 12, 21, 22, 21, 22}, {4, 4}); - RunTest({11, 12, 13, 14, 21, 22, 23, 24}, {2, 4}, {2, 2}, {2}, - {11, 12, 13, 14, 11, 12, 13, 14, 21, 22, 23, 24, 21, 22, 23, 24, - 11, 12, 13, 14, 11, 12, 13, 14, 21, 22, 23, 24, 21, 22, 23, 24}, - {4, 8}); + RunTest({2, 2}, {2, 2}); + RunTest({2, 4}, {2, 2}); // Tile3D - RunTest({111, 112, 113, 122, 123, 124}, {2, 1, 3}, {1, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 122, 123, 124, 122, 123, 124}, {2, 2, 3}); + RunTest({2, 1, 3}, {1, 2, 1}); // Tile4D - RunTest( - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, // input - {1, 2, 3, 4}, // input dims - {2, 1, 2, 1}, // repeat - {4}, // repeat dims - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, // output - {2, 2, 6, 4} // output dims - ); + RunTest({1, 2, 3, 4}, {2, 1, 2, 1}); // Tile5D - RunTest( - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, - 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, - 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, - 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71}, // input - {2, 3, 2, 3, 2}, // input dims - {2, 1, 2, 1, 2}, // repeat - {5}, // repeat dims - {0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, - 8, 9, 10, 11, 10, 11, 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, - 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, - 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, - 12, 13, 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, - 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, - 30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35, 24, 25, 24, 25, 26, 27, - 26, 27, 28, 29, 28, 29, 30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35, - 36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45, - 44, 45, 46, 47, 46, 47, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, - 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, - 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, - 48, 49, 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57, - 56, 57, 58, 59, 58, 59, 60, 61, 60, 61, 62, 63, 62, 63, 64, 65, 64, 65, - 66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71, 60, 61, 60, 61, 62, 63, - 62, 63, 64, 65, 64, 65, 66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71, - 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, - 8, 9, 10, 11, 10, 11, 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, - 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, - 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, - 12, 13, 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, - 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, - 30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35, 24, 25, 24, 25, 26, 27, - 26, 27, 28, 29, 28, 29, 30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35, - 36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45, - 44, 45, 46, 47, 46, 47, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, - 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, - 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, - 48, 49, 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57, - 56, 57, 58, 59, 58, 59, 60, 61, 60, 61, 62, 63, 62, 63, 64, 65, 64, 65, - 66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71, 60, 61, 60, 61, 62, 63, - 62, 63, 64, 65, 64, 65, 66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71}, // output - {4, 3, 4, 3, 4} // output dims - ); + RunTest({2, 3, 2, 3, 2}, {2, 1, 2, 1, 2}); // Tile1DWithOneRepeats - RunTest({111, 112, 113, 122, 123, 124}, {2, 1, 3}, {1, 1, 1}, {3}, {111, 112, 113, 122, 123, 124}, {2, 1, 3}); + RunTest({2, 1, 3}, {1, 1, 1}); // TileWhichIsBasicallyCopiesOfInputBuffer - 1 // This will trigger the MemCpy optimization path - RunTest({111, 112, 113}, {1, 1, 3}, {2, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 111, 112, 113, 111, 112, 113}, {2, 2, 3}); + RunTest({1, 1, 3}, {2, 2, 1}); // TileWhichIsBasicallyCopiesOfInputBuffer - 2 // This will trigger the MemCpy optimization path - RunTest({111, 112, 113}, {1, 1, 3}, {3, 1, 1}, {3}, {111, 112, 113, 111, 112, 113, 111, 112, 113}, {3, 1, 3}); + RunTest({1, 1, 3}, {3, 1, 1}); // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1) // This will trigger the (Batched) MemCpy optimization path - RunTest({111, 112, 113, 11, 12, 13}, {2, 1, 3}, {1, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13}, {2, 2, 3}); + RunTest({2, 1, 3}, {1, 2, 1}); // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat > 1) // This will trigger the (Batched) MemCpy optimization path - RunTest({111, 112, 113, 11, 12, 13}, {2, 1, 3}, {2, 2, 1}, {3}, - {111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13, 111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13}, {4, 2, 3}); + RunTest({2, 1, 3}, {2, 2, 1}); + +#if defined(USE_CUDA) || defined(USE_ROCM) + // _TileMemcpyKernelFromInput, vectorized 4 + RunTest({256, 512}, {3, 1}); + + // _TileMemcpyKernelFromInput, vectorized 2 + RunTest({258, 257}, {4, 1}); + + // _TileMemcpyKernelFromInput, non-vectorized + RunTest({129, 257}, {5, 1}); + + // _TileBatchedMemcpyKernelFromInput, vectorized 4 + RunTest({512, 256}, {1, 3}); + + // _TileBatchedMemcpyKernelFromInput, vectorized 2 + RunTest({257, 258}, {2, 2}); + + // _TileBatchedMemcpyKernelFromInput, non-vectorized + RunTest({129, 257}, {3, 2}); +#endif } -template <> -void RunTestWrapper() { +// OpTester's AddInput and AddOutput do not support std::vector. +void RunTestForBool(std::initializer_list input_data, std::initializer_list input_dims, + std::initializer_list repeats, std::initializer_list repeats_dims, + std::initializer_list output_data, std::initializer_list output_dims) { + OpTester test("Tile"); + test.AddInput("input", input_dims, input_data); + test.AddInput("repeats", repeats_dims, repeats); + test.AddOutput("output", output_dims, output_data); + test.Run(); +} + +void RunTestWrapperForBool() { // Tile1DWithZeroRepeats - RunTest({true, false, true}, {3}, {0}, {1}, {}, {0}); + RunTestForBool({true, false, true}, {3}, {0}, {1}, {}, {0}); // Tile2DWithZeroRepeats - RunTest({true, false, true, false}, {2, 2}, {2, 0}, {2}, {}, {4, 0}); + RunTestForBool({true, false, true, false}, {2, 2}, {2, 0}, {2}, {}, {4, 0}); // Tile1D - RunTest({true, false, true}, {3}, {3}, {1}, {true, false, true, true, false, true, true, false, true}, {9}); + RunTestForBool({true, false, true}, {3}, {3}, {1}, {true, false, true, true, false, true, true, false, true}, {9}); // Tile2D_1Axis - RunTest({true, false, true, false}, {2, 2}, {2, 1}, {2}, {true, false, true, false, true, false, true, false}, {4, 2}); + RunTestForBool({true, false, true, false}, {2, 2}, {2, 1}, {2}, {true, false, true, false, true, false, true, false}, + {4, 2}); // Tile2D_2Axes - RunTest({true, false, true, false}, {2, 2}, {2, 2}, {2}, {true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false}, {4, 4}); + RunTestForBool( + {true, false, true, false}, {2, 2}, {2, 2}, {2}, + {true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false}, {4, 4}); // Tile3D - RunTest({true, false, true, false, true, false}, {2, 1, 3}, {1, 2, 1}, {3}, {true, false, true, true, false, true, false, true, false, false, true, false}, {2, 2, 3}); + RunTestForBool({true, false, true, false, true, false}, {2, 1, 3}, {1, 2, 1}, {3}, + {true, false, true, true, false, true, false, true, false, false, true, false}, {2, 2, 3}); // Tile1DWithOneRepeats - RunTest({true, false, true, false, true, true}, {2, 1, 3}, {1, 1, 1}, {3}, {true, false, true, false, true, true}, {2, 1, 3}); + RunTestForBool({true, false, true, false, true, true}, {2, 1, 3}, {1, 1, 1}, {3}, + {true, false, true, false, true, true}, {2, 1, 3}); // TileWhichIsBasicallyCopiesOfInputBuffer - 1 // This will trigger the MemCpy optimization path - RunTest({true, false, true}, {1, 1, 3}, {2, 2, 1}, {3}, {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3}); + RunTestForBool({true, false, true}, {1, 1, 3}, {2, 2, 1}, {3}, + {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3}); // TileWhichIsBasicallyCopiesOfInputBuffer - 2 // This will trigger the MemCpy optimization path - RunTest({true, false, true}, {1, 1, 3}, {3, 1, 1}, {3}, {true, false, true, true, false, true, true, false, true}, {3, 1, 3}); + RunTestForBool({true, false, true}, {1, 1, 3}, {3, 1, 1}, {3}, + {true, false, true, true, false, true, true, false, true}, {3, 1, 3}); // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1) // This will trigger the (Batched) MemCpy optimization path - RunTest({true, false, true, true, false, true}, {2, 1, 3}, {1, 2, 1}, {3}, - {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3}); + RunTestForBool({true, false, true, true, false, true}, {2, 1, 3}, {1, 2, 1}, {3}, + {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3}); // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat > 1) // This will trigger the (Batched) MemCpy optimization path - RunTest({true, false, true, true, false, true}, {2, 1, 3}, {2, 2, 1}, {3}, - {true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true}, - {4, 2, 3}); + RunTestForBool({true, false, true, true, false, true}, {2, 1, 3}, {2, 2, 1}, {3}, + {true, false, true, true, false, true, true, false, true, true, false, true, + true, false, true, true, false, true, true, false, true, true, false, true}, + {4, 2, 3}); } -TEST(TensorOpTest, TileFloatType) { - RunTestWrapper(); -} +TEST(TensorOpTest, TileFloatType) { RunTestWrapper(); } -TEST(TensorOpTest, TileDoubleType) { - RunTestWrapper(); -} +TEST(TensorOpTest, TileDoubleType) { RunTestWrapper(); } -TEST(TensorOpTest, TileInt8Type) { - RunTestWrapper(); -} +TEST(TensorOpTest, TileInt8Type) { RunTestWrapper(); } -TEST(TensorOpTest, TileUint8Type) { - RunTestWrapper(); -} +TEST(TensorOpTest, TileUint8Type) { RunTestWrapper(); } -TEST(TensorOpTest, TileInt16Type) { - RunTestWrapper(); -} +TEST(TensorOpTest, TileInt16Type) { RunTestWrapper(); } -TEST(TensorOpTest, TileUint16Type) { - RunTestWrapper(); -} +TEST(TensorOpTest, TileUint16Type) { RunTestWrapper(); } -TEST(TensorOpTest, TileInt32Type) { - RunTestWrapper(); -} +TEST(TensorOpTest, TileInt32Type) { RunTestWrapper(); } -TEST(TensorOpTest, TileUint32Type) { - RunTestWrapper(); -} +TEST(TensorOpTest, TileUint32Type) { RunTestWrapper(); } -TEST(TensorOpTest, TileInt64Type) { - RunTestWrapper(); -} +TEST(TensorOpTest, TileInt64Type) { RunTestWrapper(); } -TEST(TensorOpTest, TileUint64Type) { - RunTestWrapper(); -} +TEST(TensorOpTest, TileUint64Type) { RunTestWrapper(); } + +TEST(TensorOpTest, TileBoolType) { RunTestWrapperForBool(); } + +#if defined(USE_CUDA) || defined(USE_ROCM) +TEST(TensorOpTest, TileMLFloat16Type) { RunTestWrapper(); } +#endif -TEST(TensorOpTest, TileBoolType) { - RunTestWrapper(); -} } // namespace test } // namespace onnxruntime