[CUDA] Rollback TileMemcpy and TileBatchedMemcpy when Block Size is Small (#11187)

This commit is contained in:
Vincent Wang 2022-04-16 07:46:43 +08:00 committed by GitHub
parent d9eeb48393
commit 0bad5b1b5a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 272 additions and 175 deletions

View file

@ -33,9 +33,9 @@ __global__ void _UnRolledTileKernel(const size_t shape_rank, const TArray<fast_d
break;
}
int q, r;
fdm_output_strides[dim].divmod(offset, q, r);
int in_coord = fdm_input_shape[dim].mod(q);
int out_coord, r;
fdm_output_strides[dim].divmod(offset, out_coord, r);
int in_coord = fdm_input_shape[dim].mod(out_coord);
input_index += input_strides[dim] * in_coord;
offset = r;
}
@ -66,7 +66,32 @@ void TileImpl(cudaStream_t stream, const size_t shape_rank, const TArray<fast_di
}
template <typename T>
__global__ void _TileMemcpyKernel(const T* input_data, T* output_data, const CUDA_LONG N, const size_t repeats) {
__global__ void _TileMemcpyKernelFromOutput(const T* input_data, T* output_data,
const fast_divmod divmod_num_input_elements, const CUDA_LONG N) {
CUDA_LONG start = num_elements_per_thread * num_threads_per_block * blockIdx.x + threadIdx.x;
T value[num_elements_per_thread];
CUDA_LONG id = start;
#pragma unroll
for (int i = 0; i < num_elements_per_thread; ++i) {
if (id < N) {
value[i] = input_data[divmod_num_input_elements.mod(id)];
id += num_threads_per_block;
}
}
id = start;
#pragma unroll
for (int i = 0; i < num_elements_per_thread; ++i) {
if (id < N) {
output_data[id] = value[i];
id += num_threads_per_block;
}
}
}
template <typename T>
__global__ void _TileMemcpyKernelFromInput(const T* input_data, T* output_data, const CUDA_LONG N,
const size_t repeats) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
T input_val = input_data[id];
for (size_t i = 0; i < repeats; ++i) {
@ -76,42 +101,94 @@ __global__ void _TileMemcpyKernel(const T* input_data, T* output_data, const CUD
}
template <typename T>
void TileMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_data, const size_t num_input_elements,
const size_t repeats) {
using Vec4T = aligned_vector<T, 4>;
using Vec2T = aligned_vector<T, 2>;
constexpr int vec4_alignment = std::alignment_of<Vec4T>::value;
constexpr int vec2_alignment = std::alignment_of<Vec2T>::value;
uint64_t address_input = reinterpret_cast<uint64_t>(input_data);
uint64_t address_output = reinterpret_cast<uint64_t>(output_data);
CUDA_LONG N = static_cast<CUDA_LONG>(num_input_elements);
if (num_input_elements % 4 == 0 && address_input % vec4_alignment == 0 && address_output % vec4_alignment == 0) {
size_t GetVectorizedSize(size_t num_input_elements, size_t num_elements_per_batch, uint64_t address_input,
uint64_t address_output, CUDA_LONG& N, int& blocksPerGrid) {
constexpr int vec4_alignment = std::alignment_of<aligned_vector<T, 4>>::value;
constexpr int vec2_alignment = std::alignment_of<aligned_vector<T, 2>>::value;
N = static_cast<CUDA_LONG>(num_input_elements);
size_t vectorized_size = 1;
if (num_elements_per_batch % 4 == 0 && address_input % vec4_alignment == 0 && address_output % vec4_alignment == 0) {
N /= 4;
int blocksPerGrid = CeilDiv(N, num_threads_per_block);
_TileMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
reinterpret_cast<const Vec4T*>(input_data), reinterpret_cast<Vec4T*>(output_data), N, repeats);
return;
} else if (num_input_elements % 2 == 0 && address_input % vec2_alignment == 0 &&
vectorized_size = 4;
} else if (num_elements_per_batch % 2 == 0 && address_input % vec2_alignment == 0 &&
address_output % vec2_alignment == 0) {
N /= 2;
int blocksPerGrid = CeilDiv(N, num_threads_per_block);
_TileMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
vectorized_size = 2;
}
blocksPerGrid = CeilDiv(N, num_threads_per_block);
return vectorized_size;
}
template <typename T>
void TileMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_data, const size_t num_input_elements,
const size_t repeats) {
// If the block number from input size is too small to fill all streaming multiprocessors,
// it won't have perf gain to launch from inputs. In this case we will use the output based kernel.
CUDA_LONG N;
int blocksPerGrid;
size_t vectorized_size =
GetVectorizedSize<T>(num_input_elements, num_input_elements, reinterpret_cast<uint64_t>(input_data),
reinterpret_cast<uint64_t>(output_data), N, blocksPerGrid);
if (blocksPerGrid < 128) {
N = static_cast<CUDA_LONG>(num_input_elements * repeats);
blocksPerGrid = CeilDiv(N, num_threads_per_block * num_elements_per_thread);
_TileMemcpyKernelFromOutput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
input_data, output_data, fast_divmod(static_cast<int>(num_input_elements)), N);
return;
}
if (vectorized_size == 4) {
using Vec4T = aligned_vector<T, 4>;
_TileMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
reinterpret_cast<const Vec4T*>(input_data), reinterpret_cast<Vec4T*>(output_data), N, repeats);
return;
} else if (vectorized_size == 2) {
using Vec2T = aligned_vector<T, 2>;
_TileMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
reinterpret_cast<const Vec2T*>(input_data), reinterpret_cast<Vec2T*>(output_data), N, repeats);
return;
}
int blocksPerGrid = CeilDiv(N, num_threads_per_block);
_TileMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(input_data, output_data, N, repeats);
_TileMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(input_data, output_data, N, repeats);
}
template <typename T>
__global__ void _TileBatchedMemcpyKernelFromOutput(const T* input_data, T* output_data,
const fast_divmod divmod_size_output_row,
const size_t size_input_row, const fast_divmod divmod_batch,
const fast_divmod divmod_size_input_row, const CUDA_LONG N) {
CUDA_LONG start = num_elements_per_thread * num_threads_per_block * blockIdx.x + threadIdx.x;
T value[num_elements_per_thread];
CUDA_LONG id = start;
#pragma unroll
for (int i = 0; i < num_elements_per_thread; ++i) {
if (id < N) {
int batch_idx, element_idx;
divmod_size_output_row.divmod(id, batch_idx, element_idx);
value[i] = input_data[divmod_batch.mod(batch_idx) * size_input_row + divmod_size_input_row.mod(element_idx)];
id += num_threads_per_block;
}
}
id = start;
#pragma unroll
for (int i = 0; i < num_elements_per_thread; ++i) {
if (id < N) {
output_data[id] = value[i];
id += num_threads_per_block;
}
}
}
// Input size is [batch, data], output size is [batch * batch_repeats, data * repeats_per_batch].
// Here size_input_row = data, size_output_row = data * repeats_per_batch,
// size_output_batch = batch * data * repeats_per_batch
template <typename T>
__global__ void _TileBatchedMemcpyKernel(const T* input_data, T* output_data, const fast_divmod divmod_size_input_row,
const CUDA_LONG size_input_row, const CUDA_LONG size_output_row,
const CUDA_LONG size_output_batch, const size_t batch_repeats,
const size_t repeats_per_batch, const CUDA_LONG N) {
__global__ void _TileBatchedMemcpyKernelFromInput(const T* input_data, T* output_data,
const fast_divmod divmod_size_input_row,
const CUDA_LONG size_input_row, const CUDA_LONG size_output_row,
const CUDA_LONG size_output_batch, const size_t batch_repeats,
const size_t repeats_per_batch, const CUDA_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
T input_val = input_data[id];
CUDA_LONG q, r;
@ -133,29 +210,37 @@ template <typename T>
void TileBatchedMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_data, const size_t size_input_row,
const size_t num_input_elements, const size_t batch_repeats,
const size_t repeats_per_batch) {
using Vec4T = aligned_vector<T, 4>;
using Vec2T = aligned_vector<T, 2>;
constexpr int vec4_alignment = std::alignment_of<Vec4T>::value;
constexpr int vec2_alignment = std::alignment_of<Vec2T>::value;
uint64_t address_input = reinterpret_cast<uint64_t>(input_data);
uint64_t address_output = reinterpret_cast<uint64_t>(output_data);
// If the block number from input size is too small to fill all streaming multiprocessors,
// it won't have perf gain to launch from inputs. In this case we will use the output based kernel.
CUDA_LONG N;
int blocksPerGrid;
size_t vectorized_size =
GetVectorizedSize<T>(num_input_elements, size_input_row, reinterpret_cast<uint64_t>(input_data),
reinterpret_cast<uint64_t>(output_data), N, blocksPerGrid);
if (blocksPerGrid < 128) {
N = static_cast<CUDA_LONG>(num_input_elements * batch_repeats * repeats_per_batch);
blocksPerGrid = CeilDiv(N, num_threads_per_block * num_elements_per_thread);
_TileBatchedMemcpyKernelFromOutput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
input_data, output_data, fast_divmod(static_cast<int>(size_input_row * repeats_per_batch)), size_input_row,
fast_divmod(static_cast<int>(num_input_elements / size_input_row)),
fast_divmod(static_cast<int>(size_input_row)), N);
return;
}
CUDA_LONG size_input_row_vec = static_cast<CUDA_LONG>(size_input_row);
CUDA_LONG N = static_cast<CUDA_LONG>(num_input_elements);
if (size_input_row % 4 == 0 && address_input % vec4_alignment == 0 && address_output % vec4_alignment == 0) {
if (vectorized_size == 4) {
using Vec4T = aligned_vector<T, 4>;
size_input_row_vec /= 4;
N /= 4;
int blocksPerGrid = CeilDiv(N, num_threads_per_block);
_TileBatchedMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
_TileBatchedMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
reinterpret_cast<const Vec4T*>(input_data), reinterpret_cast<Vec4T*>(output_data),
fast_divmod(size_input_row_vec), size_input_row_vec,
size_input_row_vec * static_cast<CUDA_LONG>(repeats_per_batch), N * static_cast<CUDA_LONG>(repeats_per_batch),
batch_repeats, repeats_per_batch, N);
return;
} else if (size_input_row % 2 == 0 && address_input % vec2_alignment == 0 && address_output % vec2_alignment == 0) {
} else if (vectorized_size == 2) {
using Vec2T = aligned_vector<T, 2>;
size_input_row_vec /= 2;
N /= 2;
int blocksPerGrid = CeilDiv(N, num_threads_per_block);
_TileBatchedMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
_TileBatchedMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
reinterpret_cast<const Vec2T*>(input_data), reinterpret_cast<Vec2T*>(output_data),
fast_divmod(size_input_row_vec), size_input_row_vec,
size_input_row_vec * static_cast<CUDA_LONG>(repeats_per_batch), N * static_cast<CUDA_LONG>(repeats_per_batch),
@ -163,8 +248,7 @@ void TileBatchedMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_d
return;
}
int blocksPerGrid = static_cast<int>(CeilDiv(N, num_threads_per_block));
_TileBatchedMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
_TileBatchedMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
input_data, output_data, fast_divmod(size_input_row_vec), size_input_row_vec,
size_input_row_vec * static_cast<CUDA_LONG>(repeats_per_batch), N * static_cast<CUDA_LONG>(repeats_per_batch),
batch_repeats, repeats_per_batch, N);

View file

@ -8,210 +8,223 @@ namespace onnxruntime {
namespace test {
template <typename T>
void RunTest(std::initializer_list<T> input,
std::initializer_list<int64_t> input_dims,
std::initializer_list<int64_t> repeat,
std::initializer_list<int64_t> repeat_dims,
std::initializer_list<T> output,
std::initializer_list<int64_t> output_dims) {
std::vector<T> InputData(size_t size) {
std::vector<T> result(size);
for (size_t i = 0; i < size; i++) {
result[i] = static_cast<T>(i);
}
return result;
}
template <>
std::vector<MLFloat16> InputData<MLFloat16>(size_t size) {
std::vector<MLFloat16> result(size);
for (size_t i = 0; i < size; i++) {
result[i] = MLFloat16(static_cast<float>(i));
}
return result;
}
template <typename T>
void RunTest(const std::vector<int64_t>& input_dims, const std::vector<int64_t>& repeats) {
size_t input_size =
static_cast<size_t>(std::accumulate(input_dims.begin(), input_dims.end(), 1LL, std::multiplies<int64_t>()));
std::vector<T> input_data = InputData<T>(input_size);
size_t rank = input_dims.size();
std::vector<int64_t> repeats_dims(1);
repeats_dims[0] = static_cast<int64_t>(rank);
std::vector<int64_t> output_dims(rank);
for (size_t i = 0; i < rank; ++i) {
output_dims[i] = input_dims[i] * repeats[i];
}
size_t output_size =
static_cast<size_t>(std::accumulate(output_dims.begin(), output_dims.end(), 1LL, std::multiplies<int64_t>()));
std::vector<T> output_data(output_size);
std::vector<int64_t> input_strides(rank);
std::vector<int64_t> output_strides(rank);
input_strides[rank - 1] = output_strides[rank - 1] = 1;
if (rank > 1) {
for (size_t i = rank - 2;; --i) {
input_strides[i] = input_dims[i + 1] * input_strides[i + 1];
output_strides[i] = output_dims[i + 1] * output_strides[i + 1];
if (i == 0) break;
}
}
for (size_t i = 0; i < output_size; ++i) {
int64_t index = 0;
int64_t remain = static_cast<int64_t>(i);
for (size_t j = 0; j < rank; ++j) {
index += (((remain / output_strides[j]) % input_dims[j]) * input_strides[j]);
remain = remain % output_strides[j];
}
output_data[i] = input_data[static_cast<size_t>(index)];
}
OpTester test("Tile");
test.AddInput<T>("input", input_dims, input);
test.AddInput<int64_t>("repeats", repeat_dims, repeat);
test.AddOutput<T>("output", output_dims, output);
if (std::is_same<T, int8_t>::value)
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT reports error: Assertion Error in makePaddedScale: 0 (regionRanges != nullptr)
else
test.AddInput<T>("input", input_dims, input_data);
test.AddInput<int64_t>("repeats", repeats_dims, repeats);
test.AddOutput<T>("output", output_dims, output_data);
if (std::is_same<T, int8_t>::value) {
// TensorRT reports error: Assertion Error in makePaddedScale: 0 (regionRanges != nullptr)
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
} else {
test.Run();
}
}
template <typename T>
void RunTestWrapper() {
// Tile1DWithZeroRepeats
RunTest<T>({1, 2, 3}, {3}, {0}, {1}, {}, {0});
RunTest<T>({3}, {0});
// Tile2DWithZeroRepeats
RunTest<T>({11, 12, 21, 22}, {2, 2}, {2, 0}, {2}, {}, {4, 0});
RunTest<T>({2, 2}, {2, 0});
// Tile1D
RunTest<T>({1, 2, 3}, {3}, {3}, {1}, {1, 2, 3, 1, 2, 3, 1, 2, 3}, {9});
RunTest<T>({3}, {3});
// Tile2D_1Axis
RunTest<T>({11, 12, 21, 22}, {2, 2}, {2, 1}, {2}, {11, 12, 21, 22, 11, 12, 21, 22}, {4, 2});
RunTest<T>({11, 12, 21, 22, 31, 32}, {2, 3}, {2, 1}, {2}, {11, 12, 21, 22, 31, 32, 11, 12, 21, 22, 31, 32}, {4, 3});
RunTest<T>({2, 2}, {2, 1});
RunTest<T>({2, 3}, {2, 1});
// Tile2D_2Axes
RunTest<T>({11, 12, 21, 22}, {2, 2}, {2, 2}, {2}, {11, 12, 11, 12, 21, 22, 21, 22, 11, 12, 11, 12, 21, 22, 21, 22}, {4, 4});
RunTest<T>({11, 12, 13, 14, 21, 22, 23, 24}, {2, 4}, {2, 2}, {2},
{11, 12, 13, 14, 11, 12, 13, 14, 21, 22, 23, 24, 21, 22, 23, 24,
11, 12, 13, 14, 11, 12, 13, 14, 21, 22, 23, 24, 21, 22, 23, 24},
{4, 8});
RunTest<T>({2, 2}, {2, 2});
RunTest<T>({2, 4}, {2, 2});
// Tile3D
RunTest<T>({111, 112, 113, 122, 123, 124}, {2, 1, 3}, {1, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 122, 123, 124, 122, 123, 124}, {2, 2, 3});
RunTest<T>({2, 1, 3}, {1, 2, 1});
// Tile4D
RunTest<T>(
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, // input
{1, 2, 3, 4}, // input dims
{2, 1, 2, 1}, // repeat
{4}, // repeat dims
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, // output
{2, 2, 6, 4} // output dims
);
RunTest<T>({1, 2, 3, 4}, {2, 1, 2, 1});
// Tile5D
RunTest<T>(
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71}, // input
{2, 3, 2, 3, 2}, // input dims
{2, 1, 2, 1, 2}, // repeat
{5}, // repeat dims
{0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
8, 9, 10, 11, 10, 11, 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5,
6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15,
14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23,
12, 13, 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21,
20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29,
30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35, 24, 25, 24, 25, 26, 27,
26, 27, 28, 29, 28, 29, 30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35,
36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45,
44, 45, 46, 47, 46, 47, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41,
42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51,
50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59,
48, 49, 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57,
56, 57, 58, 59, 58, 59, 60, 61, 60, 61, 62, 63, 62, 63, 64, 65, 64, 65,
66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71, 60, 61, 60, 61, 62, 63,
62, 63, 64, 65, 64, 65, 66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71,
0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
8, 9, 10, 11, 10, 11, 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5,
6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15,
14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23,
12, 13, 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21,
20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29,
30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35, 24, 25, 24, 25, 26, 27,
26, 27, 28, 29, 28, 29, 30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35,
36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45,
44, 45, 46, 47, 46, 47, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41,
42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51,
50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59,
48, 49, 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57,
56, 57, 58, 59, 58, 59, 60, 61, 60, 61, 62, 63, 62, 63, 64, 65, 64, 65,
66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71, 60, 61, 60, 61, 62, 63,
62, 63, 64, 65, 64, 65, 66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71}, // output
{4, 3, 4, 3, 4} // output dims
);
RunTest<T>({2, 3, 2, 3, 2}, {2, 1, 2, 1, 2});
// Tile1DWithOneRepeats
RunTest<T>({111, 112, 113, 122, 123, 124}, {2, 1, 3}, {1, 1, 1}, {3}, {111, 112, 113, 122, 123, 124}, {2, 1, 3});
RunTest<T>({2, 1, 3}, {1, 1, 1});
// TileWhichIsBasicallyCopiesOfInputBuffer - 1
// This will trigger the MemCpy optimization path
RunTest<T>({111, 112, 113}, {1, 1, 3}, {2, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 111, 112, 113, 111, 112, 113}, {2, 2, 3});
RunTest<T>({1, 1, 3}, {2, 2, 1});
// TileWhichIsBasicallyCopiesOfInputBuffer - 2
// This will trigger the MemCpy optimization path
RunTest<T>({111, 112, 113}, {1, 1, 3}, {3, 1, 1}, {3}, {111, 112, 113, 111, 112, 113, 111, 112, 113}, {3, 1, 3});
RunTest<T>({1, 1, 3}, {3, 1, 1});
// TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1)
// This will trigger the (Batched) MemCpy optimization path
RunTest<T>({111, 112, 113, 11, 12, 13}, {2, 1, 3}, {1, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13}, {2, 2, 3});
RunTest<T>({2, 1, 3}, {1, 2, 1});
// TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat > 1)
// This will trigger the (Batched) MemCpy optimization path
RunTest<T>({111, 112, 113, 11, 12, 13}, {2, 1, 3}, {2, 2, 1}, {3},
{111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13, 111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13}, {4, 2, 3});
RunTest<T>({2, 1, 3}, {2, 2, 1});
#if defined(USE_CUDA) || defined(USE_ROCM)
// _TileMemcpyKernelFromInput, vectorized 4
RunTest<T>({256, 512}, {3, 1});
// _TileMemcpyKernelFromInput, vectorized 2
RunTest<T>({258, 257}, {4, 1});
// _TileMemcpyKernelFromInput, non-vectorized
RunTest<T>({129, 257}, {5, 1});
// _TileBatchedMemcpyKernelFromInput, vectorized 4
RunTest<T>({512, 256}, {1, 3});
// _TileBatchedMemcpyKernelFromInput, vectorized 2
RunTest<T>({257, 258}, {2, 2});
// _TileBatchedMemcpyKernelFromInput, non-vectorized
RunTest<T>({129, 257}, {3, 2});
#endif
}
template <>
void RunTestWrapper<bool>() {
// OpTester's AddInput and AddOutput do not support std::vector<bool>.
void RunTestForBool(std::initializer_list<bool> input_data, std::initializer_list<int64_t> input_dims,
std::initializer_list<int64_t> repeats, std::initializer_list<int64_t> repeats_dims,
std::initializer_list<bool> output_data, std::initializer_list<int64_t> output_dims) {
OpTester test("Tile");
test.AddInput<bool>("input", input_dims, input_data);
test.AddInput<int64_t>("repeats", repeats_dims, repeats);
test.AddOutput<bool>("output", output_dims, output_data);
test.Run();
}
void RunTestWrapperForBool() {
// Tile1DWithZeroRepeats
RunTest<bool>({true, false, true}, {3}, {0}, {1}, {}, {0});
RunTestForBool({true, false, true}, {3}, {0}, {1}, {}, {0});
// Tile2DWithZeroRepeats
RunTest<bool>({true, false, true, false}, {2, 2}, {2, 0}, {2}, {}, {4, 0});
RunTestForBool({true, false, true, false}, {2, 2}, {2, 0}, {2}, {}, {4, 0});
// Tile1D
RunTest<bool>({true, false, true}, {3}, {3}, {1}, {true, false, true, true, false, true, true, false, true}, {9});
RunTestForBool({true, false, true}, {3}, {3}, {1}, {true, false, true, true, false, true, true, false, true}, {9});
// Tile2D_1Axis
RunTest<bool>({true, false, true, false}, {2, 2}, {2, 1}, {2}, {true, false, true, false, true, false, true, false}, {4, 2});
RunTestForBool({true, false, true, false}, {2, 2}, {2, 1}, {2}, {true, false, true, false, true, false, true, false},
{4, 2});
// Tile2D_2Axes
RunTest<bool>({true, false, true, false}, {2, 2}, {2, 2}, {2}, {true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false}, {4, 4});
RunTestForBool(
{true, false, true, false}, {2, 2}, {2, 2}, {2},
{true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false}, {4, 4});
// Tile3D
RunTest<bool>({true, false, true, false, true, false}, {2, 1, 3}, {1, 2, 1}, {3}, {true, false, true, true, false, true, false, true, false, false, true, false}, {2, 2, 3});
RunTestForBool({true, false, true, false, true, false}, {2, 1, 3}, {1, 2, 1}, {3},
{true, false, true, true, false, true, false, true, false, false, true, false}, {2, 2, 3});
// Tile1DWithOneRepeats
RunTest<bool>({true, false, true, false, true, true}, {2, 1, 3}, {1, 1, 1}, {3}, {true, false, true, false, true, true}, {2, 1, 3});
RunTestForBool({true, false, true, false, true, true}, {2, 1, 3}, {1, 1, 1}, {3},
{true, false, true, false, true, true}, {2, 1, 3});
// TileWhichIsBasicallyCopiesOfInputBuffer - 1
// This will trigger the MemCpy optimization path
RunTest<bool>({true, false, true}, {1, 1, 3}, {2, 2, 1}, {3}, {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
RunTestForBool({true, false, true}, {1, 1, 3}, {2, 2, 1}, {3},
{true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
// TileWhichIsBasicallyCopiesOfInputBuffer - 2
// This will trigger the MemCpy optimization path
RunTest<bool>({true, false, true}, {1, 1, 3}, {3, 1, 1}, {3}, {true, false, true, true, false, true, true, false, true}, {3, 1, 3});
RunTestForBool({true, false, true}, {1, 1, 3}, {3, 1, 1}, {3},
{true, false, true, true, false, true, true, false, true}, {3, 1, 3});
// TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1)
// This will trigger the (Batched) MemCpy optimization path
RunTest<bool>({true, false, true, true, false, true}, {2, 1, 3}, {1, 2, 1}, {3},
{true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
RunTestForBool({true, false, true, true, false, true}, {2, 1, 3}, {1, 2, 1}, {3},
{true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
// TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat > 1)
// This will trigger the (Batched) MemCpy optimization path
RunTest<bool>({true, false, true, true, false, true}, {2, 1, 3}, {2, 2, 1}, {3},
{true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true},
{4, 2, 3});
RunTestForBool({true, false, true, true, false, true}, {2, 1, 3}, {2, 2, 1}, {3},
{true, false, true, true, false, true, true, false, true, true, false, true,
true, false, true, true, false, true, true, false, true, true, false, true},
{4, 2, 3});
}
TEST(TensorOpTest, TileFloatType) {
RunTestWrapper<float>();
}
TEST(TensorOpTest, TileFloatType) { RunTestWrapper<float>(); }
TEST(TensorOpTest, TileDoubleType) {
RunTestWrapper<double>();
}
TEST(TensorOpTest, TileDoubleType) { RunTestWrapper<double>(); }
TEST(TensorOpTest, TileInt8Type) {
RunTestWrapper<int8_t>();
}
TEST(TensorOpTest, TileInt8Type) { RunTestWrapper<int8_t>(); }
TEST(TensorOpTest, TileUint8Type) {
RunTestWrapper<uint8_t>();
}
TEST(TensorOpTest, TileUint8Type) { RunTestWrapper<uint8_t>(); }
TEST(TensorOpTest, TileInt16Type) {
RunTestWrapper<int16_t>();
}
TEST(TensorOpTest, TileInt16Type) { RunTestWrapper<int16_t>(); }
TEST(TensorOpTest, TileUint16Type) {
RunTestWrapper<uint16_t>();
}
TEST(TensorOpTest, TileUint16Type) { RunTestWrapper<uint16_t>(); }
TEST(TensorOpTest, TileInt32Type) {
RunTestWrapper<int32_t>();
}
TEST(TensorOpTest, TileInt32Type) { RunTestWrapper<int32_t>(); }
TEST(TensorOpTest, TileUint32Type) {
RunTestWrapper<uint32_t>();
}
TEST(TensorOpTest, TileUint32Type) { RunTestWrapper<uint32_t>(); }
TEST(TensorOpTest, TileInt64Type) {
RunTestWrapper<int64_t>();
}
TEST(TensorOpTest, TileInt64Type) { RunTestWrapper<int64_t>(); }
TEST(TensorOpTest, TileUint64Type) {
RunTestWrapper<uint64_t>();
}
TEST(TensorOpTest, TileUint64Type) { RunTestWrapper<uint64_t>(); }
TEST(TensorOpTest, TileBoolType) { RunTestWrapperForBool(); }
#if defined(USE_CUDA) || defined(USE_ROCM)
TEST(TensorOpTest, TileMLFloat16Type) { RunTestWrapper<MLFloat16>(); }
#endif
TEST(TensorOpTest, TileBoolType) {
RunTestWrapper<bool>();
}
} // namespace test
} // namespace onnxruntime