From b95fd4e644775a4343c13435bd729bd64f411752 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <yuslepukhin@users.noreply.github.com>
Date: Wed, 27 Mar 2024 13:32:36 -0700
Subject: [PATCH] Enable CUDA EP unit testing on Windows (#20039)

### Description
Address build issues and source code discrepancies.
Fix cuda_test_provider gtest argument stack corruption.

### Motivation and Context
`OpTester` class that is widely used for kernel testing is not
suitable for testing internal classes for EPs that are built as shared
objects.
Currently, CUDA EP tests run only on Linux.
We want to enable testing and developments on Windows,
and create a usable pattern for testing of other EPs internals.

Alternatives considered:
Abstracting EP unit tests into separate test executable such as
`onnxruntime_test_all`.
This alternative was rejected as it would create a lot more changes in
the established patterns,
and potentially interfere with CUDA functionality with more complex
source code maintanence.
---
 cmake/CMakeLists.txt                          |   2 +-
 cmake/onnxruntime_providers_cuda.cmake        |   2 +-
 cmake/onnxruntime_unittests.cmake             |   7 +
 .../core/framework/execution_provider.h       |   2 +-
 .../onnxruntime/core/framework/run_options.h  |   2 +-
 .../core/mickey/blk_q4/f16_prepack_sm80.h     |  16 +-
 .../threadblock/quantb_mma_multistage.h       |   2 +-
 .../quantb_meta_mma_tensor_op_tile_iterator.h |  25 ++-
 .../shared_library/provider_wrappedtypes.h    |  15 +-
 onnxruntime/core/util/matrix_layout.h         |   2 +-
 .../test/cuda_host/blkq4_fp16_quant_sm80.h    |   7 +-
 .../cuda/test_cases/beam_search_topk.cc       |   3 +-
 .../cuda/test_cases/blkq4_fp16_gemm_sm80.h    |  24 +--
 .../test_cases/blkq4_fp16_gemm_sm80_test.cc   |  16 +-
 .../test_cases/blkq4_fp16_gemm_sm80_testcu.cu | 187 +++++++++---------
 .../cuda_execution_provider_test.cc           |  15 +-
 .../cuda/test_cases/cuda_test_provider.cc     |  32 ++-
 .../cuda/test_cases/cuda_utils_test.cc        |   7 +-
 .../cuda/test_cases/gemm_options_test.cc      |   4 +-
 .../test_cases/reduction_functions_test.cc    |   4 +-
 .../azure-pipelines/linux-gpu-ci-pipeline.yml |   4 +-
 .../azure-pipelines/win-gpu-ci-pipeline.yml   |  18 +-
 22 files changed, 219 insertions(+), 177 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index ee1959bb35..3293506141 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -76,7 +76,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
 # Enable ONNX Runtime CUDA EP's internal unit tests that directly access the EP's internal functions instead of through
 # OpKernels. When the option is ON, we will have two copies of GTest library in the same process. It is not a typical
 # use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead.
-cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF)
+cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF)
 
 option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF)
 option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF)
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index aeeac10ead..1346a9ce96 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -122,7 +122,7 @@
   endif()
   if(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
     # cuda_provider_interface.cc is removed from the object target: onnxruntime_providers_cuda_obj and
-    # add to the lib onnxruntime_providers_cuda separatedly.
+    # added to the lib onnxruntime_providers_cuda separately.
     # onnxruntime_providers_cuda_ut can share all the object files with onnxruntime_providers_cuda except cuda_provider_interface.cc.
     set(cuda_provider_interface_src ${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_provider_interface.cc)
     list(REMOVE_ITEM onnxruntime_providers_cuda_src ${cuda_provider_interface_src})
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 1ffb838328..4a351dcf90 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -779,6 +779,13 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
   onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock)
   target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey)
   target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
+  if (MSVC)
+    # Cutlass code has an issue with the following:
+    # warning C4100: 'magic': unreferenced formal parameter
+    target_compile_options(onnxruntime_providers_cuda_ut PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd4100>"
+                  "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4100>")
+  endif()
+
   list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut)
 endif()
 
diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
index 40ca96a19a..16ad943a5f 100644
--- a/include/onnxruntime/core/framework/execution_provider.h
+++ b/include/onnxruntime/core/framework/execution_provider.h
@@ -53,7 +53,7 @@ struct NodeComputeInfo {
   DestroyFunctionStateFunc release_state_func;
 };
 
-using RunOptions = OrtRunOptions;
+using RunOptions = ::OrtRunOptions;
 
 enum class DataLayout {
   NCHW,
diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h
index 5444c825d7..789c3b13f2 100644
--- a/include/onnxruntime/core/framework/run_options.h
+++ b/include/onnxruntime/core/framework/run_options.h
@@ -45,5 +45,5 @@ struct OrtRunOptions {
 };
 
 namespace onnxruntime {
-using RunOptions = OrtRunOptions;
+using RunOptions = ::OrtRunOptions;
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h
index a08cfb97ee..c81b4967d2 100644
--- a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h
+++ b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h
@@ -110,8 +110,8 @@ struct BlockwiseQuantization {
   static void prepack_weights(
       int rows,
       int columns,
-      const gsl::span<uint8_t const>& weights,     // <- int4 weights, column major
-      const gsl::span<uint8_t>& weights_prepacked  // <- int4 prepacked weights tensor, same size buffer
+      gsl::span<uint8_t const> weights,     // <- int4 weights, column major
+      gsl::span<uint8_t> weights_prepacked  // <- int4 prepacked weights tensor, same size buffer
   ) {
     ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0 &&
                     (rows % QuantBlocking::kRow) == 0 &&
@@ -171,10 +171,10 @@ struct BlockwiseQuantization {
   static void prepack_quant_scales(
       size_t rows,
       size_t columns,
-      const gsl::span<ElementT const>& scales,     // <- quant scales, column major layout
-      const gsl::span<ElementT>& scales_prepacked  // <- quant scales prepacked, same size buffer
+      gsl::span<ElementT const> scales,     // <- quant scales, column major layout
+      gsl::span<ElementT> scales_prepacked  // <- quant scales prepacked, same size buffer
   ) {
-    auto meta_shape = get_quant_meta_shape(rows, columns);
+    auto meta_shape = get_quant_meta_shape(static_cast<int>(rows), static_cast<int>(columns));
     ORT_ENFORCE(scales.size() == size_t(meta_shape.product()),
                 "Quantization scale tensor shape mismatch!");
     ORT_ENFORCE(scales_prepacked.size() == size_t(meta_shape.product()),
@@ -241,10 +241,10 @@ struct BlockwiseQuantization {
   static void prepack_quant_offsets(
       size_t rows,
       size_t columns,
-      const gsl::span<uint8_t const>& offsets,     // <- quant offsets, int4, column major layout
-      const gsl::span<uint8_t>& offsets_prepacked  // <- quant offsets prepacked, double size buffer
+      gsl::span<uint8_t const> offsets,     // <- quant offsets, int4, column major layout
+      gsl::span<uint8_t> offsets_prepacked  // <- quant offsets prepacked, double size buffer
   ) {
-    auto meta_shape = get_quant_meta_shape(rows, columns);
+    auto meta_shape = get_quant_meta_shape(static_cast<int>(rows), static_cast<int>(columns));
 
     ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0,
                 "Does not support odd number of rows or columns!");
diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h
index 8b6bac8c50..28364cc34f 100644
--- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h
+++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h
@@ -132,7 +132,7 @@ struct DummyType{
   }
 
   CUTLASS_HOST_DEVICE
-  std::monostate& operator[](int idx) {
+  std::monostate& operator[](int /*idx */) {
     return dummy_;
   }
 };
diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h
index 4ba39dda3d..26239161cf 100644
--- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h
+++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h
@@ -437,7 +437,7 @@ public:
 
   CUTLASS_HOST_DEVICE
   static void dequant(FragmentScale const &scales,
-                      FragmentOffset const &offsets,
+                      FragmentOffset const &fragment_offsets,
                       Array<uint8_t,kExpandedSize/2> const &weights,
                       Array<ElementScale, kExpandedSize>& dest){
     static_assert(kNumBsPerCoreTileFragement == 2, "Only for 16b gemm.");
@@ -453,19 +453,18 @@ public:
 
       uint32_t* dest_pair = reinterpret_cast<uint32_t*>(dest.data());
       const b64* scales_ptr = reinterpret_cast<const b64*>(scales.data());
-      const ElementOffset* offsets_ptr = nullptr;
-      if constexpr(kHasOffset) { offsets_ptr = offsets.data(); }
+      [[maybe_unused]] const ElementOffset* fragment_offsets_ptr = nullptr;
+      if constexpr(kHasOffset) { fragment_offsets_ptr = fragment_offsets.data(); }
 
       CUTLASS_PRAGMA_UNROLL
       for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){
         // dequantize: d = scale * (weight - offset)
         // to use FMA, d = scale * weight + (scale * (-offset))
 
-        b64 offsets;
-        if constexpr(kHasOffset){
-          const uint32_t* p = reinterpret_cast<const uint32_t*>(offsets_ptr);
-
+        [[maybe_unused]] b64 offsets{0};
+        if constexpr(kHasOffset) {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+          const uint32_t* p = reinterpret_cast<const uint32_t*>(fragment_offsets_ptr);
           asm volatile(
               "{\n\t"
               "  .reg  .b32    rb0, rb1;\n"      // b32 regs for fp16x2 mul operands
@@ -486,7 +485,7 @@ public:
           assert(0);
 #endif
 
-          offsets_ptr += 4;
+          fragment_offsets_ptr += 4;
         } else {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
           asm volatile(
@@ -541,7 +540,7 @@ public:
             int idx = elem_idx + mma_tile_idx * kCoreTileFragementSize + n_idx * kCoreTileFragementSize * kTilesPerMma;
             ElementScale s = scales[idx];
             if constexpr(kHasOffset){
-              offset = s * static_cast<ElementScale>(-16 - int(offsets[idx]));
+              offset = s * static_cast<ElementScale>(-16 - static_cast<int>(fragment_offsets[idx]));
             } else {
               offset = s * static_cast<ElementScale>(-16-8);
             }
@@ -795,13 +794,13 @@ public:
         }
       }
     } else if constexpr (kMmaIterationsB % 2 == 0) {
-      const uint32_t* scales_ptr = reinterpret_cast<const uint32_t*>(scales.data());
-      uint32_t* addon_ptr = reinterpret_cast<uint32_t*>(addon);
-
       if constexpr (kHasOffset){
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+        const uint32_t* scales_ptr = reinterpret_cast<const uint32_t*>(scales.data());
+        uint32_t* addon_ptr = reinterpret_cast<uint32_t*>(addon);
         // possible buffer over read 2 bytes here.
         const uint32_t* p = reinterpret_cast<const uint32_t*>(offsets.data());
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+
         asm volatile(
           "{\n\t"
           "  .reg  .b32    rb0, rb1, rb2;\n"
diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index bdad18c7ed..3bb938c1a3 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -394,14 +394,6 @@ struct ConfigOptions final {
   PROVIDER_DISALLOW_ALL(ConfigOptions)
 };
 
-struct OrtRunOptions final {
-  const ConfigOptions& GetConfigOptions() const {
-    return g_host->RunOptions__GetConfigOptions(this);
-  }
-
-  PROVIDER_DISALLOW_ALL(OrtRunOptions)
-};
-
 struct ComputeCapability final {
   static std::unique_ptr<ComputeCapability> Create(std::unique_ptr<IndexedSubGraph> t_sub_graph) { return g_host->ComputeCapability__construct(std::move(t_sub_graph)); }
   static void operator delete(void* p) { g_host->ComputeCapability__operator_delete(reinterpret_cast<ComputeCapability*>(p)); }
@@ -1283,3 +1275,10 @@ template <>
 inline gsl::span<const int64_t> Tensor::DataAsSpan() const { return g_host->Tensor__DataAsSpan_int64(this); }
 
 }  // namespace onnxruntime
+
+struct OrtRunOptions final {
+  const onnxruntime::ConfigOptions& GetConfigOptions() const {
+    return onnxruntime::g_host->RunOptions__GetConfigOptions(this);
+  }
+  PROVIDER_DISALLOW_ALL(OrtRunOptions)
+};
diff --git a/onnxruntime/core/util/matrix_layout.h b/onnxruntime/core/util/matrix_layout.h
index 783a29d8a2..43843da3fb 100644
--- a/onnxruntime/core/util/matrix_layout.h
+++ b/onnxruntime/core/util/matrix_layout.h
@@ -378,7 +378,7 @@ class MatrixRef {
   MatrixRef(
       NonConstMatrixRef const& ref,  ///< MatrixRef to non-const data
       /// SFINAE trick to avoid creating a copy-constructor when Element_ is already non-const
-      _Magic magic = (typename std::enable_if<!IsNonConstRef, _Magic>::type)0
+      [[maybe_unused]] _Magic magic = (typename std::enable_if<!IsNonConstRef, _Magic>::type)0
       ) : data_(ref.data()), shape_(ref.shape()), layout_(Layout::packed(ref.shape())) {}
 
   ORT_FORCEINLINE
diff --git a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h
index 6ea8b55505..942b1c4d2c 100644
--- a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h
+++ b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h
@@ -74,7 +74,8 @@ inline void sm80_prepack_quant_scales_ref(
     int columns,
     const MatrixRef<ScaleElementT const, Layout, true>& tensor_scale,
     const MatrixRef<ScaleElementT, Layout, true>& tensor_scale_prepacked) {
-  ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] == (columns / QuantBlocking::kColumn),
+  ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] ==
+                                                                             (columns / QuantBlocking::kColumn),
               "Unexpected tensor_scale shape! Expected: (",
               rows / QuantBlocking::kRow, ", ", columns / QuantBlocking::kColumn, ")");
   ORT_ENFORCE(tensor_scale_prepacked.shape() == tensor_scale.shape());
@@ -84,7 +85,9 @@ inline void sm80_prepack_quant_scales_ref(
   //    2 B operand tiles per mma instruction stacked on k dimension
   //    (1,n) quantization blocking
   if constexpr (sizeof(ScaleElementT) != 2 || QuantBlocking::kRow != 1) {
-    ORT_THROW("sm80_prepack_quant_scales_ref should only be called for row-wise block quantization on 16b float values.");
+    ORT_THROW(
+        "sm80_prepack_quant_scales_ref should only be called for "
+        " row-wise block quantization on 16b float values.");
   }
 
   // In Ampere tensor op, each operand B tile is 8 x 8, in a warp of 32 threads, each thread
diff --git a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
index 9fecec9f7e..a0d115c41c 100644
--- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
@@ -80,7 +80,8 @@ TEST(TestBeamSearch, TopK) {
   std::vector<float> top_k_values_ref(batch_size * k);
   std::vector<int32_t> top_k_tokens_ref(batch_size * k);
   std::vector<int32_t> top_k_indices_ref(batch_size * k);
-  ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, beam_size, vocab_size, k);
+  ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size,
+                       beam_size, vocab_size, k);
 
   const int32_t max_vocab_parts = 128;
   size_t buffer_size = batch_x_beam_x_vocab * 4                                      // input
diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h
index bbe370675f..f0dfaf1a58 100644
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h
@@ -14,12 +14,14 @@
 
 #pragma once
 
-#include <random>
+#include "test/cuda_host/blkq4_fp16_quant_sm80.h"
+
+#include <random>
+#include <thrust/host_vector.h>
 
-#include "core/util/matrix_layout.h"
 #include "core/common/common.h"
 #include "core/mickey/blk_q4/f16_prepack_sm80.h"
-#include "test/cuda_host/blkq4_fp16_quant_sm80.h"
+#include "core/util/matrix_layout.h"
 
 namespace onnxruntime {
 namespace cuda {
@@ -48,10 +50,10 @@ Status sm80_supported();
 template <typename ElementT, int block_size, bool col_blocking, bool has_offsets>
 inline void blkq4_weights_gen(
     int rows, int columns,
-    std::vector<ElementT>& dequants,
-    std::vector<uint8_t>& q_weights,
-    std::vector<ElementT>& q_scales,
-    std::vector<uint8_t>& q_zp) {
+    thrust::host_vector<ElementT>& dequants,
+    thrust::host_vector<uint8_t>& q_weights,
+    thrust::host_vector<ElementT>& q_scales,
+    thrust::host_vector<uint8_t>& q_zp) {
   using Base = onnxruntime::cuda::BlockwiseQuantization<
       ElementT,
       block_size,
@@ -74,7 +76,7 @@ inline void blkq4_weights_gen(
 
   const auto q_weight_shape = Base::get_quant_weights_shape(rows, columns);
   const auto meta_shape = Base::get_quant_meta_shape(rows, columns);
-  const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]);
+  [[maybe_unused]] const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]);
 
   //
   // For testing quantization and dequantization, it is not straight
@@ -120,9 +122,9 @@ inline void blkq4_weights_gen(
 
   q_scales.resize(meta_shape.product());
   for (size_t i = 0; i < q_scales.size(); i++) {
-    uint32_t v = dis(gen);
-    uint32_t m = (v % 63) + 1;
-    uint32_t e = (v >> 6) % 4;
+    uint32_t vl = dis(gen);
+    uint32_t m = (vl % 63) + 1;
+    uint32_t e = (vl >> 6) % 4;
     q_scales[i] = ElementT(m / static_cast<float>(1 << (2 + e)));
   }
   MatrixRef<ElementT, ColumnMajorLayout, true> tensor_scale(
diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
index e687ae73e6..e7fa0dae02 100644
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
@@ -11,15 +11,15 @@
  *   well with CUTLASS headers.
  */
 
+#include "blkq4_fp16_gemm_sm80.h"
+
+#include "gtest/gtest.h"
+#include <thrust/host_vector.h>
 #include <random>
 
 #include "core/framework/float16.h"
 #include "core/mlas/inc/mlas_q4.h"
 
-#include "blkq4_fp16_gemm_sm80.h"
-
-#include "gtest/gtest.h"
-
 namespace onnxruntime {
 namespace test {
 
@@ -43,10 +43,10 @@ void testPrepack(int rows, int columns) {
   const auto meta_shape = Base::get_quant_meta_shape(rows, columns);
   const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]);
 
-  std::vector<ElementW> q_weights;
-  std::vector<ElementT> q_scales;
-  std::vector<ElementQOffset> q_zp;
-  std::vector<ElementT> dequants;
+  thrust::host_vector<ElementW> q_weights;
+  thrust::host_vector<ElementT> q_scales;
+  thrust::host_vector<ElementQOffset> q_zp;
+  thrust::host_vector<ElementT> dequants;
   onnxruntime::cuda::test::blkq4_weights_gen<ElementT, block_size, col_blocking, has_offset>(
       rows, columns, dequants, q_weights, q_scales, q_zp);
 
diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu
index 69c929d446..210c33933d 100644
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu
@@ -11,9 +11,11 @@
  *   well with gtest headers.
  */
 
+#include "blkq4_fp16_gemm_sm80.h"
+
 #include <random>
-#include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
 
 #include "core/mickey/blk_q4/f16_gemm_sm80.h"
 
@@ -26,13 +28,11 @@
 
 #include "core/common/common.h"
 
-#include "blkq4_fp16_gemm_sm80.h"
-
 namespace onnxruntime {
-namespace cuda{
-namespace test{
+namespace cuda {
+namespace test {
 
-Status sm80_supported(){
+Status sm80_supported() {
   cudaDeviceProp props;
 
   cudaError_t error = cudaGetDeviceProperties(&props, 0);
@@ -55,27 +55,25 @@ Status sm80_supported(){
  *        Copied directly from cutlass util/reference/device/gemm.h
  *        for the strange reason that compiler insists on asking
  *        for explicit stream argument in kernel launch.
-*/
+ */
 template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename AccumulatorType
->
+    typename ElementA,
+    typename LayoutA,
+    typename ElementB,
+    typename LayoutB,
+    typename ElementC,
+    typename LayoutC,
+    typename ScalarType,
+    typename AccumulatorType>
 void compute_gemm_ref(
-  cutlass::gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  cutlass::TensorRef<ElementA, LayoutA> tensor_a,
-  cutlass::TensorRef<ElementB, LayoutB> tensor_b,
-  ScalarType beta,
-  cutlass::TensorRef<ElementC, LayoutC> tensor_c,
-  cutlass::TensorRef<ElementC, LayoutC> tensor_d,
-  AccumulatorType initial_accum = AccumulatorType(0)) {
-
+    cutlass::gemm::GemmCoord problem_size,
+    ScalarType alpha,
+    cutlass::TensorRef<ElementA, LayoutA> tensor_a,
+    cutlass::TensorRef<ElementB, LayoutB> tensor_b,
+    ScalarType beta,
+    cutlass::TensorRef<ElementC, LayoutC> tensor_c,
+    cutlass::TensorRef<ElementC, LayoutC> tensor_d,
+    AccumulatorType initial_accum = AccumulatorType(0)) {
   // Blocking structure potentially improves performance of reference implementation
   // with a minor increase in complexity.
   //
@@ -85,30 +83,27 @@ void compute_gemm_ref(
   dim3 block(16, 8);
 
   dim3 grid(
-    (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow),
-    (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn)
-  );
+      (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow),
+      (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn));
 
   // Launch a GEMM kernel
   cutlass::reference::device::kernel::Gemm<
-    cutlass::TensorRef<ElementA, LayoutA>,
-    cutlass::TensorRef<ElementB, LayoutB>,
-    cutlass::TensorRef<ElementC, LayoutC>,
-    ScalarType,
-    AccumulatorType,
-    OutputTile,
-    cutlass::multiply_add<AccumulatorType>,
-    cutlass::NumericConverter<ElementC, ScalarType>
-  ><<<grid, block, 0, 0>>>(
-    problem_size,
-    alpha,
-    tensor_a,
-    tensor_b,
-    beta,
-    tensor_c,
-    tensor_d,
-    initial_accum
-  );
+      cutlass::TensorRef<ElementA, LayoutA>,
+      cutlass::TensorRef<ElementB, LayoutB>,
+      cutlass::TensorRef<ElementC, LayoutC>,
+      ScalarType,
+      AccumulatorType,
+      OutputTile,
+      cutlass::multiply_add<AccumulatorType>,
+      cutlass::NumericConverter<ElementC, ScalarType>><<<grid, block, 0, 0>>>(
+      problem_size,
+      alpha,
+      tensor_a,
+      tensor_b,
+      beta,
+      tensor_c,
+      tensor_d,
+      initial_accum);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -117,28 +112,31 @@ void compute_gemm_ref(
 //
 
 template <
-  typename Element,
-  typename LayoutCutlass,
-  typename Layout = std::conditional_t<std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value, ColumnMajorLayout, RowMajorLayout>
-  >
+    typename Element,
+    typename LayoutCutlass,
+    typename Layout = std::conditional_t<std::is_same<LayoutCutlass,
+                                                      cutlass::layout::ColumnMajor>::value,
+                                         ColumnMajorLayout, RowMajorLayout>>
 __forceinline__
-MatrixRef<Element, Layout, true> make_MatrixRef(cutlass::HostTensor<Element, LayoutCutlass> const& tensor) {
-  static_assert(std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value
-                || std::is_same<LayoutCutlass, cutlass::layout::RowMajor>::value);
+    MatrixRef<Element, Layout, true>
+    make_MatrixRef(cutlass::HostTensor<Element, LayoutCutlass> const& tensor) {
+  static_assert(std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value ||
+                std::is_same<LayoutCutlass, cutlass::layout::RowMajor>::value);
   auto shape = make_Position(tensor.extent().row(), tensor.extent().column());
-  auto* ptr = const_cast<typename std::remove_const<Element>::type *>(tensor.host_data());
+  auto* ptr = const_cast<typename std::remove_const<Element>::type*>(tensor.host_data());
   return MatrixRef<Element, Layout, true>(ptr, tensor.capacity(), shape);
 }
 
 template <
-  typename Element,
-  typename LayoutCutlass,
-  typename Layout = std::conditional_t<std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value, ColumnMajorLayout, RowMajorLayout>
-  >
+    typename Element,
+    typename LayoutCutlass,
+    typename Layout = std::conditional_t<std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value,
+                                         ColumnMajorLayout, RowMajorLayout>>
 __forceinline__
-MatrixRef<Element const, Layout, true> make_ConstMatrixRef(cutlass::HostTensor<Element, LayoutCutlass> const& tensor) {
-  static_assert(std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value
-                || std::is_same<LayoutCutlass, cutlass::layout::RowMajor>::value);
+    MatrixRef<Element const, Layout, true>
+    make_ConstMatrixRef(cutlass::HostTensor<Element, LayoutCutlass> const& tensor) {
+  static_assert(std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value ||
+                std::is_same<LayoutCutlass, cutlass::layout::RowMajor>::value);
   auto shape = make_Position(tensor.extent().row(), tensor.extent().column());
   return MatrixRef<Element const, Layout, true>(tensor.host_data(), tensor.capacity(), shape);
 }
@@ -147,7 +145,7 @@ MatrixRef<Element const, Layout, true> make_ConstMatrixRef(cutlass::HostTensor<E
 // Invoking the kernel
 //
 
-template<
+template <
     int block_size,
     bool column_wise_blocking,
     bool small_m,
@@ -160,9 +158,9 @@ void run_blkq4_gemm(int m, int n, int k) {
 
   using ElementDequant = cutlass::half_t;
   using QuantBlocking =
-    typename std::conditional<column_wise_blocking,
-                     cutlass::MatrixShape<block_size, 1>,
-                     cutlass::MatrixShape<1, block_size>>::type;
+      typename std::conditional<column_wise_blocking,
+                                cutlass::MatrixShape<block_size, 1>,
+                                cutlass::MatrixShape<1, block_size>>::type;
 
   using GemmRunner = BlkQ4F16GemmImpl<ElementDequant, QuantBlocking, small_m, has_offsets>;
 
@@ -181,17 +179,18 @@ void run_blkq4_gemm(int m, int n, int k) {
   using LayoutInputQScale = typename GemmRunner::LayoutInputQScale;
 
   const cutlass::gemm::GemmCoord problem_size = {m, n, k};
-  const auto q_weight_shape = cutlass::make_Coord(problem_size.k()/2, problem_size.n());
-  const auto meta_shape = cutlass::make_Coord(problem_size.k()/QuantBlocking::kRow, problem_size.n()/QuantBlocking::kColumn);
+  const auto q_weight_shape = cutlass::make_Coord(problem_size.k() / 2, problem_size.n());
+  const auto meta_shape = cutlass::make_Coord(problem_size.k() / QuantBlocking::kRow, problem_size.n() /
+                                                                                          QuantBlocking::kColumn);
 
   //
   // Generate quantized and dequantizeed input matrix B [K, N]
   //
   static_assert(std::is_same<LayoutInputWPack, cutlass::layout::ColumnMajor>::value);
-  std::vector<ElementW> q_weights;
-  std::vector<ElementQScale> q_scales;
-  std::vector<ElementQOffset> q_zp;
-  std::vector<ElementDequant> dequants;
+  thrust::host_vector<ElementW> q_weights;
+  thrust::host_vector<ElementQScale> q_scales;
+  thrust::host_vector<ElementQOffset> q_zp;
+  thrust::host_vector<ElementDequant> dequants;
   onnxruntime::cuda::test::blkq4_weights_gen<ElementDequant, block_size, column_wise_blocking, has_offsets>(
       problem_size.k(), problem_size.n(), dequants, q_weights, q_scales, q_zp);
 
@@ -201,11 +200,11 @@ void run_blkq4_gemm(int m, int n, int k) {
       4,
       column_wise_blocking>;
 
-  std::vector<ElementW> packed_w(q_weight_shape.product());
+  thrust::host_vector<ElementW> packed_w(q_weight_shape.product());
   PrepackT::prepack_weights(problem_size.k(), problem_size.n(), q_weights, packed_w);
-  std::vector<ElementQScale> packed_scales(meta_shape.product());
+  thrust::host_vector<ElementQScale> packed_scales(meta_shape.product());
   PrepackT::prepack_quant_scales(problem_size.k(), problem_size.n(), q_scales, packed_scales);
-  std::vector<ElementQOffset> packed_zp;
+  thrust::host_vector<ElementQOffset> packed_zp;
   if constexpr (has_offsets) {
     packed_zp.resize(meta_shape.product());
     PrepackT::prepack_quant_offsets(problem_size.k(), problem_size.n(), q_zp, packed_zp);
@@ -240,16 +239,16 @@ void run_blkq4_gemm(int m, int n, int k) {
   //
   thrust::device_vector<ElementW> d_packed_w(packed_w);
   cutlass::TensorRef<ElementWPack const, LayoutInputWPack> ref_W(
-    reinterpret_cast<ElementWPack const *>(d_packed_w.data().get()),
-    LayoutInputWPack::packed({problem_size.k()/2, problem_size.n()/2}));
+      reinterpret_cast<ElementWPack const*>(d_packed_w.data().get()),
+      LayoutInputWPack::packed({problem_size.k() / 2, problem_size.n() / 2}));
 
   thrust::device_vector<ElementQScale> d_packed_scales(packed_scales);
   cutlass::TensorRef<ElementQScale const, LayoutInputQScale> ref_scales(
-    d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape));
+      d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape));
 
   thrust::device_vector<ElementQOffset> d_packed_zp(packed_zp);
   cutlass::TensorRef<ElementQOffset const, LayoutInputQScale> ref_zp(
-    d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape));
+      d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape));
 
   tensor_a.sync_device();
   tensor_c.sync_device();
@@ -257,16 +256,16 @@ void run_blkq4_gemm(int m, int n, int k) {
 
   // run GEMM
   cutlass::Status status;
-  if constexpr (has_offsets){
+  if constexpr (has_offsets) {
     status = GemmRunner::run(
-      nullptr, problem_size, tensor_a.device_ref(), ref_W,
-      ref_scales, ref_zp,
-      tensor_c.device_ref(), tensor_d.device_ref());
+        nullptr, problem_size, tensor_a.device_ref(), ref_W,
+        ref_scales, ref_zp,
+        tensor_c.device_ref(), tensor_d.device_ref());
   } else {
     status = GemmRunner::run(
-      nullptr, problem_size, tensor_a.device_ref(), ref_W,
-      ref_scales,
-      tensor_c.device_ref(), tensor_d.device_ref());
+        nullptr, problem_size, tensor_a.device_ref(), ref_W,
+        ref_scales,
+        tensor_c.device_ref(), tensor_d.device_ref());
   }
   ORT_ENFORCE(status == cutlass::Status::kSuccess, "Kernel execution failed: ", cutlassGetStatusString(status));
 
@@ -275,7 +274,7 @@ void run_blkq4_gemm(int m, int n, int k) {
   using LayoutInputB = cutlass::layout::ColumnMajor;
   thrust::device_vector<ElementInputB> d_dequants(dequants);
   cutlass::TensorRef<ElementInputB, LayoutInputB> ref_B(
-    d_dequants.data().get(), LayoutInputB::packed(problem_size.kn()));
+      d_dequants.data().get(), LayoutInputB::packed(problem_size.kn()));
   cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
       problem_size.mn());  // <- Create matrix D with dimensions M x N used to store output from
                            // reference kernel
@@ -289,9 +288,9 @@ void run_blkq4_gemm(int m, int n, int k) {
   ElementComputeEpilogue beta = ElementComputeEpilogue(0);
 
   compute_gemm_ref<ElementInputA, LayoutInputA,
-               ElementInputB, LayoutInputB,
-               ElementOutput, LayoutOutput,
-               ElementComputeEpilogue, ElementAccumulator>(
+                   ElementInputB, LayoutInputB,
+                   ElementOutput, LayoutOutput,
+                   ElementComputeEpilogue, ElementAccumulator>(
       problem_size,
       alpha,
       tensor_a.device_ref(),
@@ -300,17 +299,17 @@ void run_blkq4_gemm(int m, int n, int k) {
       tensor_c.device_ref(),
       tensor_ref_d.device_ref());
 
-  // Wait for kernels to finish
+  //// Wait for kernels to finish
   cudaDeviceSynchronize();
 
-  // Copy output data from CUTLASS and reference kernel to host for comparison
+  //// Copy output data from CUTLASS and reference kernel to host for comparison
   tensor_d.sync_host();
   tensor_ref_d.sync_host();
 
-  // Check if output from CUTLASS kernel and reference kernel are equal or not
+  //// Check if output from CUTLASS kernel and reference kernel are equal or not
   bool passed = cutlass::reference::host::TensorEquals(
-    tensor_d.host_view(),
-    tensor_ref_d.host_view());
+      tensor_d.host_view(),
+      tensor_ref_d.host_view());
   ORT_ENFORCE(passed, "Gemm kernel result wrong!");
 }
 
diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
index 8dfaaedcbb..72357ec7e0 100644
--- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
@@ -5,11 +5,14 @@
 // extra code in the core of CUDA EP and that code may
 //  1. slow down performance critical applications and
 //  2. increase binary size of ORT.
-#include <iostream>
-#include "core/providers/cuda/cuda_execution_provider.h"
-#include "core/providers/cuda/cuda_allocator.h"
-#include "core/providers/cuda/cuda_stream_handle.h"
+
 #include "gtest/gtest.h"
+#include <iostream>
+
+#include "core/framework/run_options.h"
+#include "core/providers/cuda/cuda_allocator.h"
+#include "core/providers/cuda/cuda_execution_provider.h"
+#include "core/providers/cuda/cuda_stream_handle.h"
 
 namespace onnxruntime {
 namespace cuda {
@@ -22,7 +25,7 @@ TEST(TestDeferredRelease, WithArena) {
   CUDAExecutionProvider ep(info);
   AllocatorPtr gpu_alloctor = ep.CreatePreferredAllocators()[0];
 
-  RunOptions run_opts;
+  onnxruntime::RunOptions run_opts;
   run_opts.run_tag = "log1";
   // Allocator for call cudaMallocHost and cudaFreeHost
   // For details, see CUDAPinnedAllocator in cuda_allocator.cc.
@@ -54,7 +57,7 @@ TEST(TestDeferredRelease, WithoutArena) {
   CUDAExecutionProviderInfo info;
   CUDAExecutionProvider ep(info);
 
-  RunOptions run_opts;
+  onnxruntime::RunOptions run_opts;
   run_opts.run_tag = "log1";
 
   OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc
index 96c1e17331..d8384b4327 100644
--- a/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc
@@ -6,12 +6,11 @@
 #include "core/providers/cuda/cuda_provider_factory_creator.h"
 #include "core/providers/cuda/cuda_provider_options.h"
 
+#include "gtest/gtest.h"
 #include <memory>
 #include <chrono>
 
 #include "core/common/gsl.h"
-#include "gtest/gtest.h"
-
 #include "core/providers/cuda/cuda_execution_provider.h"
 #include "core/providers/cuda/cuda_execution_provider_info.h"
 #include "core/providers/cuda/cuda_allocator.h"
@@ -64,8 +63,15 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA {
 
   void cuda__Impl_Cast(void*, const float*, double*, size_t) override {}
 
-  Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return CudaCall<cudaError, false>(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); }
-  void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { CudaCall<cudaError, true>(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); }
+  Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode,
+                        const char* msg, const char* file, const int line) override {
+    return CudaCall<cudaError, false>(cudaError(retCode), exprString, libName,
+                                      cudaError(successCode), msg, file, line);
+  }
+  void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode,
+                     const char* msg, const char* file, const int line) override {
+    CudaCall<cudaError, true>(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line);
+  }
 
   void CopyGpuToCpu(void*, const void*, const size_t, const OrtMemoryInfo&, const OrtMemoryInfo&) override {}
 
@@ -93,19 +99,27 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA {
     return nullptr;
   }
 
-  std::shared_ptr<IAllocator> CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy, onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&, const OrtArenaCfg*) override {
+  std::shared_ptr<IAllocator> CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy,
+                                                  onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&,
+                                                  const OrtArenaCfg*) override {
     return nullptr;
   }
 
   void TestAll() override {
-    // TestAll is the entry point of CUDA EP's insternal tests.
+    // TestAll is the entry point of CUDA EP's internal tests.
     // Those internal tests are not directly callable from onnxruntime_test_all
     // because CUDA EP is a shared library now.
     // Instead, this is a test provider that implements all the test cases.
     // onnxruntime_test_all is calling this function through TryGetProviderInfo_CUDA_Test.
-    int argc = 1;
-    std::string mock_exe_name = "onnxruntime_providers_cuda_ut";
-    char* argv[] = {const_cast<char*>(mock_exe_name.data())};
+    char mock_exe_name[] = "onnxruntime_providers_cuda_ut";
+
+    // InitGoogleTest decrements argc and removes args from argv if
+    // recognized. By doing so it decrements argc and shifts argv,
+    // to do so, from the code comments it expects argc + 1 with the last one always being nullptr
+    // otherwise, windows diagnostics reports stack corruption. when
+    int argc = 1;  // Change argc to 2 and edit the filter below if necessary
+    char* argv[] = {mock_exe_name, nullptr};
+    // char* argv[] = {mock_exe_name, "--gtest_filter=ReductionFunctionsTest.*", nullptr};
     ::testing::InitGoogleTest(&argc, argv);
     ORT_ENFORCE(RUN_ALL_TESTS() == 0);
   }
diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
index 9d20bc545d..7468a57184 100644
--- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
@@ -1,11 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "gtest/gtest.h"
+
 #include <memory>
 #include <vector>
 
-#include "gtest/gtest.h"
-
 #include "core/common/common.h"
 #include "core/providers/cuda/shared_inc/cuda_call.h"
 #include "core/providers/cuda/shared_inc/cuda_utils.h"
@@ -32,7 +32,8 @@ void TestFillCorrectness(size_t num_elements, TElement value) {
   Fill<TElement>(nullptr, buffer.get(), value, num_elements);
 
   auto cpu_buffer = std::make_unique<TElement[]>(num_elements);
-  CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), cudaMemcpyKind::cudaMemcpyDeviceToHost));
+  CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement),
+                             cudaMemcpyKind::cudaMemcpyDeviceToHost));
 
   std::vector<TElement> expected_data(num_elements, value);
   EXPECT_EQ(std::memcmp(cpu_buffer.get(), expected_data.data(), num_elements * sizeof(TElement)), 0);
diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
index 4917701e51..6636e15040 100644
--- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
@@ -1,11 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "gtest/gtest.h"
+
 #include "core/common/common.h"
 #include "core/providers/cuda/cuda_common.h"
 
-#include "gtest/gtest.h"
-
 namespace onnxruntime {
 namespace cuda {
 namespace test {
diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
index c460e806c1..ec7e985285 100644
--- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
@@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <memory>
-
 #include "gtest/gtest.h"
 
+#include <memory>
+
 #include "core/providers/cuda/shared_inc/cuda_utils.h"
 #include "core/common/optional.h"
 #include "core/providers/cuda/reduction/reduction_functions.h"
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
index b7232e9dc4..0e885b71b4 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
@@ -150,7 +150,9 @@ stages:
               --enable_cuda_profiling --enable_cuda_nhwc_ops \
               --enable_pybind --build_java \
               --use_cache \
-              --cmake_extra_defines  CMAKE_CUDA_ARCHITECTURES=75; \
+              --cmake_extra_defines  CMAKE_CUDA_ARCHITECTURES=75 \
+              --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON \
+              --cmake_extra_defines  onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON; \
                 ccache -sv; \
                 ccache -z"
       workingDirectory: $(Build.SourcesDirectory)
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
index eee38ac04b..291e2f4e19 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
@@ -42,7 +42,12 @@ stages:
         BuildConfig: 'RelWithDebInfo'
         EnvSetupScript: setup_env_cuda.bat
         buildArch: x64
-        additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+        additionalBuildFlags: >-
+          --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8"
+          --enable_cuda_profiling
+          --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
+          --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
         msbuildPlatform: x64
         isX86: false
         job_name_suffix: x64_RelWithDebInfo
@@ -59,7 +64,10 @@ stages:
         BuildConfig: 'RelWithDebInfo'
         EnvSetupScript: setup_env_cuda.bat
         buildArch: x64
-        additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+        additionalBuildFlags: >-
+          --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8"
+          --skip_onnx_tests
+          --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
         msbuildPlatform: x64
         isX86: false
         job_name_suffix: x64_RelWithDebInfo
@@ -95,7 +103,11 @@ stages:
         EnvSetupScript: setup_env_cuda.bat
         buildArch: x64
         # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags
-        additionalBuildFlags: --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF
+        additionalBuildFlags: >-
+          --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda
+          --cuda_home="$(Agent.TempDirectory)\v11.8"
+          --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF
         msbuildPlatform: x64
         isX86: false
         job_name_suffix: x64_RelWithDebInfo