From 0bad5b1b5ab9277108f3bbcbe5c62d60a67bbd5f Mon Sep 17 00:00:00 2001
From: Vincent Wang <wangwchpku@outlook.com>
Date: Sat, 16 Apr 2022 07:46:43 +0800
Subject: [PATCH] [CUDA] Rollback TileMemcpy and TileBatchedMemcpy when Block
 Size is Small (#11187)

---
 .../core/providers/cuda/tensor/tile_impl.cu   | 172 ++++++++---
 .../test/providers/cpu/tensor/tile_op_test.cc | 275 +++++++++---------
 2 files changed, 272 insertions(+), 175 deletions(-)
diff --git a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu
index 29632444b7..e3ef2965c5 100644
--- a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu
@@ -33,9 +33,9 @@ __global__ void _UnRolledTileKernel(const size_t shape_rank, const TArray<fast_d
           break;
         }
 
-        int q, r;
-        fdm_output_strides[dim].divmod(offset, q, r);
-        int in_coord = fdm_input_shape[dim].mod(q);
+        int out_coord, r;
+        fdm_output_strides[dim].divmod(offset, out_coord, r);
+        int in_coord = fdm_input_shape[dim].mod(out_coord);
         input_index += input_strides[dim] * in_coord;
         offset = r;
       }
@@ -66,7 +66,32 @@ void TileImpl(cudaStream_t stream, const size_t shape_rank, const TArray<fast_di
 }
 
 template <typename T>
-__global__ void _TileMemcpyKernel(const T* input_data, T* output_data, const CUDA_LONG N, const size_t repeats) {
+__global__ void _TileMemcpyKernelFromOutput(const T* input_data, T* output_data,
+                                            const fast_divmod divmod_num_input_elements, const CUDA_LONG N) {
+  CUDA_LONG start = num_elements_per_thread * num_threads_per_block * blockIdx.x + threadIdx.x;
+  T value[num_elements_per_thread];
+  CUDA_LONG id = start;
+#pragma unroll
+  for (int i = 0; i < num_elements_per_thread; ++i) {
+    if (id < N) {
+      value[i] = input_data[divmod_num_input_elements.mod(id)];
+      id += num_threads_per_block;
+    }
+  }
+
+  id = start;
+#pragma unroll
+  for (int i = 0; i < num_elements_per_thread; ++i) {
+    if (id < N) {
+      output_data[id] = value[i];
+      id += num_threads_per_block;
+    }
+  }
+}
+
+template <typename T>
+__global__ void _TileMemcpyKernelFromInput(const T* input_data, T* output_data, const CUDA_LONG N,
+                                           const size_t repeats) {
   CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
   T input_val = input_data[id];
   for (size_t i = 0; i < repeats; ++i) {
@@ -76,42 +101,94 @@ __global__ void _TileMemcpyKernel(const T* input_data, T* output_data, const CUD
 }
 
 template <typename T>
-void TileMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_data, const size_t num_input_elements,
-                    const size_t repeats) {
-  using Vec4T = aligned_vector<T, 4>;
-  using Vec2T = aligned_vector<T, 2>;
-  constexpr int vec4_alignment = std::alignment_of<Vec4T>::value;
-  constexpr int vec2_alignment = std::alignment_of<Vec2T>::value;
-  uint64_t address_input = reinterpret_cast<uint64_t>(input_data);
-  uint64_t address_output = reinterpret_cast<uint64_t>(output_data);
-  CUDA_LONG N = static_cast<CUDA_LONG>(num_input_elements);
-  if (num_input_elements % 4 == 0 && address_input % vec4_alignment == 0 && address_output % vec4_alignment == 0) {
+size_t GetVectorizedSize(size_t num_input_elements, size_t num_elements_per_batch, uint64_t address_input,
+                         uint64_t address_output, CUDA_LONG& N, int& blocksPerGrid) {
+  constexpr int vec4_alignment = std::alignment_of<aligned_vector<T, 4>>::value;
+  constexpr int vec2_alignment = std::alignment_of<aligned_vector<T, 2>>::value;
+  N = static_cast<CUDA_LONG>(num_input_elements);
+  size_t vectorized_size = 1;
+  if (num_elements_per_batch % 4 == 0 && address_input % vec4_alignment == 0 && address_output % vec4_alignment == 0) {
     N /= 4;
-    int blocksPerGrid = CeilDiv(N, num_threads_per_block);
-    _TileMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
-        reinterpret_cast<const Vec4T*>(input_data), reinterpret_cast<Vec4T*>(output_data), N, repeats);
-    return;
-  } else if (num_input_elements % 2 == 0 && address_input % vec2_alignment == 0 &&
+    vectorized_size = 4;
+  } else if (num_elements_per_batch % 2 == 0 && address_input % vec2_alignment == 0 &&
              address_output % vec2_alignment == 0) {
     N /= 2;
-    int blocksPerGrid = CeilDiv(N, num_threads_per_block);
-    _TileMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
+    vectorized_size = 2;
+  }
+  blocksPerGrid = CeilDiv(N, num_threads_per_block);
+  return vectorized_size;
+}
+
+template <typename T>
+void TileMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_data, const size_t num_input_elements,
+                    const size_t repeats) {
+  // If the block number from input size is too small to fill all streaming multiprocessors,
+  // it won't have perf gain to launch from inputs. In this case we will use the output based kernel.
+  CUDA_LONG N;
+  int blocksPerGrid;
+  size_t vectorized_size =
+      GetVectorizedSize<T>(num_input_elements, num_input_elements, reinterpret_cast<uint64_t>(input_data),
+                           reinterpret_cast<uint64_t>(output_data), N, blocksPerGrid);
+  if (blocksPerGrid < 128) {
+    N = static_cast<CUDA_LONG>(num_input_elements * repeats);
+    blocksPerGrid = CeilDiv(N, num_threads_per_block * num_elements_per_thread);
+    _TileMemcpyKernelFromOutput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
+        input_data, output_data, fast_divmod(static_cast<int>(num_input_elements)), N);
+    return;
+  }
+
+  if (vectorized_size == 4) {
+    using Vec4T = aligned_vector<T, 4>;
+    _TileMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
+        reinterpret_cast<const Vec4T*>(input_data), reinterpret_cast<Vec4T*>(output_data), N, repeats);
+    return;
+  } else if (vectorized_size == 2) {
+    using Vec2T = aligned_vector<T, 2>;
+    _TileMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
         reinterpret_cast<const Vec2T*>(input_data), reinterpret_cast<Vec2T*>(output_data), N, repeats);
     return;
   }
 
-  int blocksPerGrid = CeilDiv(N, num_threads_per_block);
-  _TileMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(input_data, output_data, N, repeats);
+  _TileMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(input_data, output_data, N, repeats);
+}
+
+template <typename T>
+__global__ void _TileBatchedMemcpyKernelFromOutput(const T* input_data, T* output_data,
+                                                   const fast_divmod divmod_size_output_row,
+                                                   const size_t size_input_row, const fast_divmod divmod_batch,
+                                                   const fast_divmod divmod_size_input_row, const CUDA_LONG N) {
+  CUDA_LONG start = num_elements_per_thread * num_threads_per_block * blockIdx.x + threadIdx.x;
+  T value[num_elements_per_thread];
+  CUDA_LONG id = start;
+#pragma unroll
+  for (int i = 0; i < num_elements_per_thread; ++i) {
+    if (id < N) {
+      int batch_idx, element_idx;
+      divmod_size_output_row.divmod(id, batch_idx, element_idx);
+      value[i] = input_data[divmod_batch.mod(batch_idx) * size_input_row + divmod_size_input_row.mod(element_idx)];
+      id += num_threads_per_block;
+    }
+  }
+
+  id = start;
+#pragma unroll
+  for (int i = 0; i < num_elements_per_thread; ++i) {
+    if (id < N) {
+      output_data[id] = value[i];
+      id += num_threads_per_block;
+    }
+  }
 }
 
 // Input size is [batch, data], output size is [batch * batch_repeats, data * repeats_per_batch].
 // Here size_input_row = data, size_output_row = data * repeats_per_batch,
 // size_output_batch = batch * data * repeats_per_batch
 template <typename T>
-__global__ void _TileBatchedMemcpyKernel(const T* input_data, T* output_data, const fast_divmod divmod_size_input_row,
-                                         const CUDA_LONG size_input_row, const CUDA_LONG size_output_row,
-                                         const CUDA_LONG size_output_batch, const size_t batch_repeats,
-                                         const size_t repeats_per_batch, const CUDA_LONG N) {
+__global__ void _TileBatchedMemcpyKernelFromInput(const T* input_data, T* output_data,
+                                                  const fast_divmod divmod_size_input_row,
+                                                  const CUDA_LONG size_input_row, const CUDA_LONG size_output_row,
+                                                  const CUDA_LONG size_output_batch, const size_t batch_repeats,
+                                                  const size_t repeats_per_batch, const CUDA_LONG N) {
   CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
   T input_val = input_data[id];
   CUDA_LONG q, r;
@@ -133,29 +210,37 @@ template <typename T>
 void TileBatchedMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_data, const size_t size_input_row,
                            const size_t num_input_elements, const size_t batch_repeats,
                            const size_t repeats_per_batch) {
-  using Vec4T = aligned_vector<T, 4>;
-  using Vec2T = aligned_vector<T, 2>;
-  constexpr int vec4_alignment = std::alignment_of<Vec4T>::value;
-  constexpr int vec2_alignment = std::alignment_of<Vec2T>::value;
-  uint64_t address_input = reinterpret_cast<uint64_t>(input_data);
-  uint64_t address_output = reinterpret_cast<uint64_t>(output_data);
+  // If the block number from input size is too small to fill all streaming multiprocessors,
+  // it won't have perf gain to launch from inputs. In this case we will use the output based kernel.
+  CUDA_LONG N;
+  int blocksPerGrid;
+  size_t vectorized_size =
+      GetVectorizedSize<T>(num_input_elements, size_input_row, reinterpret_cast<uint64_t>(input_data),
+                           reinterpret_cast<uint64_t>(output_data), N, blocksPerGrid);
+  if (blocksPerGrid < 128) {
+    N = static_cast<CUDA_LONG>(num_input_elements * batch_repeats * repeats_per_batch);
+    blocksPerGrid = CeilDiv(N, num_threads_per_block * num_elements_per_thread);
+    _TileBatchedMemcpyKernelFromOutput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
+        input_data, output_data, fast_divmod(static_cast<int>(size_input_row * repeats_per_batch)), size_input_row,
+        fast_divmod(static_cast<int>(num_input_elements / size_input_row)),
+        fast_divmod(static_cast<int>(size_input_row)), N);
+    return;
+  }
+
   CUDA_LONG size_input_row_vec = static_cast<CUDA_LONG>(size_input_row);
-  CUDA_LONG N = static_cast<CUDA_LONG>(num_input_elements);
-  if (size_input_row % 4 == 0 && address_input % vec4_alignment == 0 && address_output % vec4_alignment == 0) {
+  if (vectorized_size == 4) {
+    using Vec4T = aligned_vector<T, 4>;
     size_input_row_vec /= 4;
-    N /= 4;
-    int blocksPerGrid = CeilDiv(N, num_threads_per_block);
-    _TileBatchedMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
+    _TileBatchedMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
         reinterpret_cast<const Vec4T*>(input_data), reinterpret_cast<Vec4T*>(output_data),
         fast_divmod(size_input_row_vec), size_input_row_vec,
         size_input_row_vec * static_cast<CUDA_LONG>(repeats_per_batch), N * static_cast<CUDA_LONG>(repeats_per_batch),
         batch_repeats, repeats_per_batch, N);
     return;
-  } else if (size_input_row % 2 == 0 && address_input % vec2_alignment == 0 && address_output % vec2_alignment == 0) {
+  } else if (vectorized_size == 2) {
+    using Vec2T = aligned_vector<T, 2>;
     size_input_row_vec /= 2;
-    N /= 2;
-    int blocksPerGrid = CeilDiv(N, num_threads_per_block);
-    _TileBatchedMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
+    _TileBatchedMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
         reinterpret_cast<const Vec2T*>(input_data), reinterpret_cast<Vec2T*>(output_data),
         fast_divmod(size_input_row_vec), size_input_row_vec,
         size_input_row_vec * static_cast<CUDA_LONG>(repeats_per_batch), N * static_cast<CUDA_LONG>(repeats_per_batch),
@@ -163,8 +248,7 @@ void TileBatchedMemcpyImpl(cudaStream_t stream, const T* input_data, T* output_d
     return;
   }
 
-  int blocksPerGrid = static_cast<int>(CeilDiv(N, num_threads_per_block));
-  _TileBatchedMemcpyKernel<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
+  _TileBatchedMemcpyKernelFromInput<<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
       input_data, output_data, fast_divmod(size_input_row_vec), size_input_row_vec,
       size_input_row_vec * static_cast<CUDA_LONG>(repeats_per_batch), N * static_cast<CUDA_LONG>(repeats_per_batch),
       batch_repeats, repeats_per_batch, N);
diff --git a/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc b/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc
index 45c36417f5..773a0cb3e2 100644
--- a/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc
@@ -8,210 +8,223 @@ namespace onnxruntime {
 namespace test {
 
 template <typename T>
-void RunTest(std::initializer_list<T> input,
-             std::initializer_list<int64_t> input_dims,
-             std::initializer_list<int64_t> repeat,
-             std::initializer_list<int64_t> repeat_dims,
-             std::initializer_list<T> output,
-             std::initializer_list<int64_t> output_dims) {
+std::vector<T> InputData(size_t size) {
+  std::vector<T> result(size);
+  for (size_t i = 0; i < size; i++) {
+    result[i] = static_cast<T>(i);
+  }
+  return result;
+}
+
+template <>
+std::vector<MLFloat16> InputData<MLFloat16>(size_t size) {
+  std::vector<MLFloat16> result(size);
+  for (size_t i = 0; i < size; i++) {
+    result[i] = MLFloat16(static_cast<float>(i));
+  }
+  return result;
+}
+
+template <typename T>
+void RunTest(const std::vector<int64_t>& input_dims, const std::vector<int64_t>& repeats) {
+  size_t input_size =
+      static_cast<size_t>(std::accumulate(input_dims.begin(), input_dims.end(), 1LL, std::multiplies<int64_t>()));
+  std::vector<T> input_data = InputData<T>(input_size);
+  size_t rank = input_dims.size();
+  std::vector<int64_t> repeats_dims(1);
+  repeats_dims[0] = static_cast<int64_t>(rank);
+  std::vector<int64_t> output_dims(rank);
+  for (size_t i = 0; i < rank; ++i) {
+    output_dims[i] = input_dims[i] * repeats[i];
+  }
+  size_t output_size =
+      static_cast<size_t>(std::accumulate(output_dims.begin(), output_dims.end(), 1LL, std::multiplies<int64_t>()));
+  std::vector<T> output_data(output_size);
+  std::vector<int64_t> input_strides(rank);
+  std::vector<int64_t> output_strides(rank);
+  input_strides[rank - 1] = output_strides[rank - 1] = 1;
+  if (rank > 1) {
+    for (size_t i = rank - 2;; --i) {
+      input_strides[i] = input_dims[i + 1] * input_strides[i + 1];
+      output_strides[i] = output_dims[i + 1] * output_strides[i + 1];
+      if (i == 0) break;
+    }
+  }
+  for (size_t i = 0; i < output_size; ++i) {
+    int64_t index = 0;
+    int64_t remain = static_cast<int64_t>(i);
+    for (size_t j = 0; j < rank; ++j) {
+      index += (((remain / output_strides[j]) % input_dims[j]) * input_strides[j]);
+      remain = remain % output_strides[j];
+    }
+    output_data[i] = input_data[static_cast<size_t>(index)];
+  }
   OpTester test("Tile");
-  test.AddInput<T>("input", input_dims, input);
-  test.AddInput<int64_t>("repeats", repeat_dims, repeat);
-  test.AddOutput<T>("output", output_dims, output);
-  if (std::is_same<T, int8_t>::value)
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT reports error: Assertion Error in makePaddedScale: 0 (regionRanges != nullptr)
-  else
+  test.AddInput<T>("input", input_dims, input_data);
+  test.AddInput<int64_t>("repeats", repeats_dims, repeats);
+  test.AddOutput<T>("output", output_dims, output_data);
+  if (std::is_same<T, int8_t>::value) {
+    // TensorRT reports error: Assertion Error in makePaddedScale: 0 (regionRanges != nullptr)
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  } else {
     test.Run();
+  }
 }
 
 template <typename T>
 void RunTestWrapper() {
   // Tile1DWithZeroRepeats
-  RunTest<T>({1, 2, 3}, {3}, {0}, {1}, {}, {0});
+  RunTest<T>({3}, {0});
 
   // Tile2DWithZeroRepeats
-  RunTest<T>({11, 12, 21, 22}, {2, 2}, {2, 0}, {2}, {}, {4, 0});
+  RunTest<T>({2, 2}, {2, 0});
 
   // Tile1D
-  RunTest<T>({1, 2, 3}, {3}, {3}, {1}, {1, 2, 3, 1, 2, 3, 1, 2, 3}, {9});
+  RunTest<T>({3}, {3});
 
   // Tile2D_1Axis
-  RunTest<T>({11, 12, 21, 22}, {2, 2}, {2, 1}, {2}, {11, 12, 21, 22, 11, 12, 21, 22}, {4, 2});
-  RunTest<T>({11, 12, 21, 22, 31, 32}, {2, 3}, {2, 1}, {2}, {11, 12, 21, 22, 31, 32, 11, 12, 21, 22, 31, 32}, {4, 3});
+  RunTest<T>({2, 2}, {2, 1});
+  RunTest<T>({2, 3}, {2, 1});
 
   // Tile2D_2Axes
-  RunTest<T>({11, 12, 21, 22}, {2, 2}, {2, 2}, {2}, {11, 12, 11, 12, 21, 22, 21, 22, 11, 12, 11, 12, 21, 22, 21, 22}, {4, 4});
-  RunTest<T>({11, 12, 13, 14, 21, 22, 23, 24}, {2, 4}, {2, 2}, {2},
-             {11, 12, 13, 14, 11, 12, 13, 14, 21, 22, 23, 24, 21, 22, 23, 24,
-              11, 12, 13, 14, 11, 12, 13, 14, 21, 22, 23, 24, 21, 22, 23, 24},
-             {4, 8});
+  RunTest<T>({2, 2}, {2, 2});
+  RunTest<T>({2, 4}, {2, 2});
 
   // Tile3D
-  RunTest<T>({111, 112, 113, 122, 123, 124}, {2, 1, 3}, {1, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 122, 123, 124, 122, 123, 124}, {2, 2, 3});
+  RunTest<T>({2, 1, 3}, {1, 2, 1});
 
   // Tile4D
-  RunTest<T>(
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},  // input
-      {1, 2, 3, 4},                                                                            // input dims
-      {2, 1, 2, 1},                                                                            // repeat
-      {4},                                                                                     // repeat dims
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-       12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
-       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-       12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},  // output
-      {2, 2, 6, 4}                                                                                       // output dims
-  );
+  RunTest<T>({1, 2, 3, 4}, {2, 1, 2, 1});
 
   // Tile5D
-  RunTest<T>(
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
-       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
-       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
-       54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71},  // input
-      {2, 3, 2, 3, 2},                                                           // input dims
-      {2, 1, 2, 1, 2},                                                           // repeat
-      {5},                                                                       // repeat dims
-      {0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
-       8, 9, 10, 11, 10, 11, 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5,
-       6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15,
-       14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23,
-       12, 13, 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21,
-       20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29,
-       30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35, 24, 25, 24, 25, 26, 27,
-       26, 27, 28, 29, 28, 29, 30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35,
-       36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45,
-       44, 45, 46, 47, 46, 47, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41,
-       42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51,
-       50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59,
-       48, 49, 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57,
-       56, 57, 58, 59, 58, 59, 60, 61, 60, 61, 62, 63, 62, 63, 64, 65, 64, 65,
-       66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71, 60, 61, 60, 61, 62, 63,
-       62, 63, 64, 65, 64, 65, 66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71,
-       0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
-       8, 9, 10, 11, 10, 11, 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5,
-       6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15,
-       14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23,
-       12, 13, 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21,
-       20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29,
-       30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35, 24, 25, 24, 25, 26, 27,
-       26, 27, 28, 29, 28, 29, 30, 31, 30, 31, 32, 33, 32, 33, 34, 35, 34, 35,
-       36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45,
-       44, 45, 46, 47, 46, 47, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41, 40, 41,
-       42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51,
-       50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59,
-       48, 49, 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57,
-       56, 57, 58, 59, 58, 59, 60, 61, 60, 61, 62, 63, 62, 63, 64, 65, 64, 65,
-       66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71, 60, 61, 60, 61, 62, 63,
-       62, 63, 64, 65, 64, 65, 66, 67, 66, 67, 68, 69, 68, 69, 70, 71, 70, 71},  // output
-      {4, 3, 4, 3, 4}                                                            // output dims
-  );
+  RunTest<T>({2, 3, 2, 3, 2}, {2, 1, 2, 1, 2});
 
   // Tile1DWithOneRepeats
-  RunTest<T>({111, 112, 113, 122, 123, 124}, {2, 1, 3}, {1, 1, 1}, {3}, {111, 112, 113, 122, 123, 124}, {2, 1, 3});
+  RunTest<T>({2, 1, 3}, {1, 1, 1});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 1
   // This will trigger the MemCpy optimization path
-  RunTest<T>({111, 112, 113}, {1, 1, 3}, {2, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 111, 112, 113, 111, 112, 113}, {2, 2, 3});
+  RunTest<T>({1, 1, 3}, {2, 2, 1});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 2
   // This will trigger the MemCpy optimization path
-  RunTest<T>({111, 112, 113}, {1, 1, 3}, {3, 1, 1}, {3}, {111, 112, 113, 111, 112, 113, 111, 112, 113}, {3, 1, 3});
+  RunTest<T>({1, 1, 3}, {3, 1, 1});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1)
   // This will trigger the (Batched) MemCpy optimization path
-  RunTest<T>({111, 112, 113, 11, 12, 13}, {2, 1, 3}, {1, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13}, {2, 2, 3});
+  RunTest<T>({2, 1, 3}, {1, 2, 1});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat > 1)
   // This will trigger the (Batched) MemCpy optimization path
-  RunTest<T>({111, 112, 113, 11, 12, 13}, {2, 1, 3}, {2, 2, 1}, {3},
-             {111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13, 111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13}, {4, 2, 3});
+  RunTest<T>({2, 1, 3}, {2, 2, 1});
+
+#if defined(USE_CUDA) || defined(USE_ROCM)
+  // _TileMemcpyKernelFromInput, vectorized 4
+  RunTest<T>({256, 512}, {3, 1});
+
+  // _TileMemcpyKernelFromInput, vectorized 2
+  RunTest<T>({258, 257}, {4, 1});
+
+  // _TileMemcpyKernelFromInput, non-vectorized
+  RunTest<T>({129, 257}, {5, 1});
+
+  // _TileBatchedMemcpyKernelFromInput, vectorized 4
+  RunTest<T>({512, 256}, {1, 3});
+
+  // _TileBatchedMemcpyKernelFromInput, vectorized 2
+  RunTest<T>({257, 258}, {2, 2});
+
+  // _TileBatchedMemcpyKernelFromInput, non-vectorized
+  RunTest<T>({129, 257}, {3, 2});
+#endif
 }
 
-template <>
-void RunTestWrapper<bool>() {
+// OpTester's AddInput and AddOutput do not support std::vector<bool>.
+void RunTestForBool(std::initializer_list<bool> input_data, std::initializer_list<int64_t> input_dims,
+                    std::initializer_list<int64_t> repeats, std::initializer_list<int64_t> repeats_dims,
+                    std::initializer_list<bool> output_data, std::initializer_list<int64_t> output_dims) {
+  OpTester test("Tile");
+  test.AddInput<bool>("input", input_dims, input_data);
+  test.AddInput<int64_t>("repeats", repeats_dims, repeats);
+  test.AddOutput<bool>("output", output_dims, output_data);
+  test.Run();
+}
+
+void RunTestWrapperForBool() {
   // Tile1DWithZeroRepeats
-  RunTest<bool>({true, false, true}, {3}, {0}, {1}, {}, {0});
+  RunTestForBool({true, false, true}, {3}, {0}, {1}, {}, {0});
 
   // Tile2DWithZeroRepeats
-  RunTest<bool>({true, false, true, false}, {2, 2}, {2, 0}, {2}, {}, {4, 0});
+  RunTestForBool({true, false, true, false}, {2, 2}, {2, 0}, {2}, {}, {4, 0});
 
   // Tile1D
-  RunTest<bool>({true, false, true}, {3}, {3}, {1}, {true, false, true, true, false, true, true, false, true}, {9});
+  RunTestForBool({true, false, true}, {3}, {3}, {1}, {true, false, true, true, false, true, true, false, true}, {9});
 
   // Tile2D_1Axis
-  RunTest<bool>({true, false, true, false}, {2, 2}, {2, 1}, {2}, {true, false, true, false, true, false, true, false}, {4, 2});
+  RunTestForBool({true, false, true, false}, {2, 2}, {2, 1}, {2}, {true, false, true, false, true, false, true, false},
+                 {4, 2});
 
   // Tile2D_2Axes
-  RunTest<bool>({true, false, true, false}, {2, 2}, {2, 2}, {2}, {true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false}, {4, 4});
+  RunTestForBool(
+      {true, false, true, false}, {2, 2}, {2, 2}, {2},
+      {true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false}, {4, 4});
 
   // Tile3D
-  RunTest<bool>({true, false, true, false, true, false}, {2, 1, 3}, {1, 2, 1}, {3}, {true, false, true, true, false, true, false, true, false, false, true, false}, {2, 2, 3});
+  RunTestForBool({true, false, true, false, true, false}, {2, 1, 3}, {1, 2, 1}, {3},
+                 {true, false, true, true, false, true, false, true, false, false, true, false}, {2, 2, 3});
 
   // Tile1DWithOneRepeats
-  RunTest<bool>({true, false, true, false, true, true}, {2, 1, 3}, {1, 1, 1}, {3}, {true, false, true, false, true, true}, {2, 1, 3});
+  RunTestForBool({true, false, true, false, true, true}, {2, 1, 3}, {1, 1, 1}, {3},
+                 {true, false, true, false, true, true}, {2, 1, 3});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 1
   // This will trigger the MemCpy optimization path
-  RunTest<bool>({true, false, true}, {1, 1, 3}, {2, 2, 1}, {3}, {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
+  RunTestForBool({true, false, true}, {1, 1, 3}, {2, 2, 1}, {3},
+                 {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 2
   // This will trigger the MemCpy optimization path
-  RunTest<bool>({true, false, true}, {1, 1, 3}, {3, 1, 1}, {3}, {true, false, true, true, false, true, true, false, true}, {3, 1, 3});
+  RunTestForBool({true, false, true}, {1, 1, 3}, {3, 1, 1}, {3},
+                 {true, false, true, true, false, true, true, false, true}, {3, 1, 3});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1)
   // This will trigger the (Batched) MemCpy optimization path
-  RunTest<bool>({true, false, true, true, false, true}, {2, 1, 3}, {1, 2, 1}, {3},
-                {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
+  RunTestForBool({true, false, true, true, false, true}, {2, 1, 3}, {1, 2, 1}, {3},
+                 {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat > 1)
   // This will trigger the (Batched) MemCpy optimization path
-  RunTest<bool>({true, false, true, true, false, true}, {2, 1, 3}, {2, 2, 1}, {3},
-                {true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true},
-                {4, 2, 3});
+  RunTestForBool({true, false, true, true, false, true}, {2, 1, 3}, {2, 2, 1}, {3},
+                 {true, false, true, true, false, true, true, false, true, true, false, true,
+                  true, false, true, true, false, true, true, false, true, true, false, true},
+                 {4, 2, 3});
 }
 
-TEST(TensorOpTest, TileFloatType) {
-  RunTestWrapper<float>();
-}
+TEST(TensorOpTest, TileFloatType) { RunTestWrapper<float>(); }
 
-TEST(TensorOpTest, TileDoubleType) {
-  RunTestWrapper<double>();
-}
+TEST(TensorOpTest, TileDoubleType) { RunTestWrapper<double>(); }
 
-TEST(TensorOpTest, TileInt8Type) {
-  RunTestWrapper<int8_t>();
-}
+TEST(TensorOpTest, TileInt8Type) { RunTestWrapper<int8_t>(); }
 
-TEST(TensorOpTest, TileUint8Type) {
-  RunTestWrapper<uint8_t>();
-}
+TEST(TensorOpTest, TileUint8Type) { RunTestWrapper<uint8_t>(); }
 
-TEST(TensorOpTest, TileInt16Type) {
-  RunTestWrapper<int16_t>();
-}
+TEST(TensorOpTest, TileInt16Type) { RunTestWrapper<int16_t>(); }
 
-TEST(TensorOpTest, TileUint16Type) {
-  RunTestWrapper<uint16_t>();
-}
+TEST(TensorOpTest, TileUint16Type) { RunTestWrapper<uint16_t>(); }
 
-TEST(TensorOpTest, TileInt32Type) {
-  RunTestWrapper<int32_t>();
-}
+TEST(TensorOpTest, TileInt32Type) { RunTestWrapper<int32_t>(); }
 
-TEST(TensorOpTest, TileUint32Type) {
-  RunTestWrapper<uint32_t>();
-}
+TEST(TensorOpTest, TileUint32Type) { RunTestWrapper<uint32_t>(); }
 
-TEST(TensorOpTest, TileInt64Type) {
-  RunTestWrapper<int64_t>();
-}
+TEST(TensorOpTest, TileInt64Type) { RunTestWrapper<int64_t>(); }
 
-TEST(TensorOpTest, TileUint64Type) {
-  RunTestWrapper<uint64_t>();
-}
+TEST(TensorOpTest, TileUint64Type) { RunTestWrapper<uint64_t>(); }
+
+TEST(TensorOpTest, TileBoolType) { RunTestWrapperForBool(); }
+
+#if defined(USE_CUDA) || defined(USE_ROCM)
+TEST(TensorOpTest, TileMLFloat16Type) { RunTestWrapper<MLFloat16>(); }
+#endif
 
-TEST(TensorOpTest, TileBoolType) {
-  RunTestWrapper<bool>();
-}
 }  // namespace test
 }  // namespace onnxruntime