Enable CUDA EP unit testing on Windows (#20039)

### Description Address build issues and source code discrepancies. Fix cuda_test_provider gtest argument stack corruption. ### Motivation and Context `OpTester` class that is widely used for kernel testing is not suitable for testing internal classes for EPs that are built as shared objects. Currently, CUDA EP tests run only on Linux. We want to enable testing and developments on Windows, and create a usable pattern for testing of other EPs internals. Alternatives considered: Abstracting EP unit tests into separate test executable such as `onnxruntime_test_all`. This alternative was rejected as it would create a lot more changes in the established patterns, and potentially interfere with CUDA functionality with more complex source code maintanence.
2026-07-06 04:28:32 +00:00 · 2024-03-27 13:32:36 -07:00 · 2024-03-27 13:32:36 -07:00 · b95fd4e644
commit b95fd4e644
parent ab2eaedfaa
22 changed files with 219 additions and 177 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -76,7 +76,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
 # Enable ONNX Runtime CUDA EP's internal unit tests that directly access the EP's internal functions instead of through
 # OpKernels. When the option is ON, we will have two copies of GTest library in the same process. It is not a typical
 # use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead.
-cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF)
+cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF)

 option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF)
 option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF)
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@ -122,7 +122,7 @@
  endif()
  if(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
    # cuda_provider_interface.cc is removed from the object target: onnxruntime_providers_cuda_obj and
-    # add to the lib onnxruntime_providers_cuda separatedly.
+    # added to the lib onnxruntime_providers_cuda separately.
    # onnxruntime_providers_cuda_ut can share all the object files with onnxruntime_providers_cuda except cuda_provider_interface.cc.
    set(cuda_provider_interface_src ${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_provider_interface.cc)
    list(REMOVE_ITEM onnxruntime_providers_cuda_src ${cuda_provider_interface_src})
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@ -779,6 +779,13 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
  onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock)
  target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey)
  target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
+  if (MSVC)
+    # Cutlass code has an issue with the following:
+    # warning C4100: 'magic': unreferenced formal parameter
+    target_compile_options(onnxruntime_providers_cuda_ut PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd4100>"
+                  "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4100>")
+  endif()
+
  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut)
 endif()

--- a/include/onnxruntime/core/framework/execution_provider.h
+++ b/include/onnxruntime/core/framework/execution_provider.h
@ -53,7 +53,7 @@ struct NodeComputeInfo {
  DestroyFunctionStateFunc release_state_func;
 };

-using RunOptions = OrtRunOptions;
+using RunOptions = ::OrtRunOptions;

 enum class DataLayout {
  NCHW,
--- a/include/onnxruntime/core/framework/run_options.h
+++ b/include/onnxruntime/core/framework/run_options.h
@ -45,5 +45,5 @@ struct OrtRunOptions {
 };

 namespace onnxruntime {
-using RunOptions = OrtRunOptions;
+using RunOptions = ::OrtRunOptions;
 }  // namespace onnxruntime
--- a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h
+++ b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h
@ -110,8 +110,8 @@ struct BlockwiseQuantization {
  static void prepack_weights(
      int rows,
      int columns,
-      const gsl::span<uint8_t const>& weights,     // <- int4 weights, column major
-      const gsl::span<uint8_t>& weights_prepacked  // <- int4 prepacked weights tensor, same size buffer
+      gsl::span<uint8_t const> weights,     // <- int4 weights, column major
+      gsl::span<uint8_t> weights_prepacked  // <- int4 prepacked weights tensor, same size buffer
  ) {
    ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0 &&
                    (rows % QuantBlocking::kRow) == 0 &&
@ -171,10 +171,10 @@ struct BlockwiseQuantization {
  static void prepack_quant_scales(
      size_t rows,
      size_t columns,
-      const gsl::span<ElementT const>& scales,     // <- quant scales, column major layout
-      const gsl::span<ElementT>& scales_prepacked  // <- quant scales prepacked, same size buffer
+      gsl::span<ElementT const> scales,     // <- quant scales, column major layout
+      gsl::span<ElementT> scales_prepacked  // <- quant scales prepacked, same size buffer
  ) {
-    auto meta_shape = get_quant_meta_shape(rows, columns);
+    auto meta_shape = get_quant_meta_shape(static_cast<int>(rows), static_cast<int>(columns));
    ORT_ENFORCE(scales.size() == size_t(meta_shape.product()),
                "Quantization scale tensor shape mismatch!");
    ORT_ENFORCE(scales_prepacked.size() == size_t(meta_shape.product()),
@ -241,10 +241,10 @@ struct BlockwiseQuantization {
  static void prepack_quant_offsets(
      size_t rows,
      size_t columns,
-      const gsl::span<uint8_t const>& offsets,     // <- quant offsets, int4, column major layout
-      const gsl::span<uint8_t>& offsets_prepacked  // <- quant offsets prepacked, double size buffer
+      gsl::span<uint8_t const> offsets,     // <- quant offsets, int4, column major layout
+      gsl::span<uint8_t> offsets_prepacked  // <- quant offsets prepacked, double size buffer
  ) {
-    auto meta_shape = get_quant_meta_shape(rows, columns);
+    auto meta_shape = get_quant_meta_shape(static_cast<int>(rows), static_cast<int>(columns));

    ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0,
                "Does not support odd number of rows or columns!");
--- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h
+++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h
@ -132,7 +132,7 @@ struct DummyType{
  }

  CUTLASS_HOST_DEVICE
-  std::monostate& operator[](int idx) {
+  std::monostate& operator[](int /*idx */) {
    return dummy_;
  }
 };
--- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h
+++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h
@ -437,7 +437,7 @@ public:

  CUTLASS_HOST_DEVICE
  static void dequant(FragmentScale const &scales,
-                      FragmentOffset const &offsets,
+                      FragmentOffset const &fragment_offsets,
                      Array<uint8_t,kExpandedSize/2> const &weights,
                      Array<ElementScale, kExpandedSize>& dest){
    static_assert(kNumBsPerCoreTileFragement == 2, "Only for 16b gemm.");
@ -453,19 +453,18 @@ public:

      uint32_t* dest_pair = reinterpret_cast<uint32_t*>(dest.data());
      const b64* scales_ptr = reinterpret_cast<const b64*>(scales.data());
-      const ElementOffset* offsets_ptr = nullptr;
-      if constexpr(kHasOffset) { offsets_ptr = offsets.data(); }
+      [[maybe_unused]] const ElementOffset* fragment_offsets_ptr = nullptr;
+      if constexpr(kHasOffset) { fragment_offsets_ptr = fragment_offsets.data(); }

      CUTLASS_PRAGMA_UNROLL
      for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){
        // dequantize: d = scale * (weight - offset)
        // to use FMA, d = scale * weight + (scale * (-offset))

-        b64 offsets;
-        if constexpr(kHasOffset){
-          const uint32_t* p = reinterpret_cast<const uint32_t*>(offsets_ptr);
-
+        [[maybe_unused]] b64 offsets{0};
+        if constexpr(kHasOffset) {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+          const uint32_t* p = reinterpret_cast<const uint32_t*>(fragment_offsets_ptr);
          asm volatile(
              "{\n\t"
              "  .reg  .b32    rb0, rb1;\n"      // b32 regs for fp16x2 mul operands
@ -486,7 +485,7 @@ public:
          assert(0);
 #endif

-          offsets_ptr += 4;
+          fragment_offsets_ptr += 4;
        } else {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
          asm volatile(
@ -541,7 +540,7 @@ public:
            int idx = elem_idx + mma_tile_idx * kCoreTileFragementSize + n_idx * kCoreTileFragementSize * kTilesPerMma;
            ElementScale s = scales[idx];
            if constexpr(kHasOffset){
-              offset = s * static_cast<ElementScale>(-16 - int(offsets[idx]));
+              offset = s * static_cast<ElementScale>(-16 - static_cast<int>(fragment_offsets[idx]));
            } else {
              offset = s * static_cast<ElementScale>(-16-8);
            }
@ -795,13 +794,13 @@ public:
        }
      }
    } else if constexpr (kMmaIterationsB % 2 == 0) {
-      const uint32_t* scales_ptr = reinterpret_cast<const uint32_t*>(scales.data());
-      uint32_t* addon_ptr = reinterpret_cast<uint32_t*>(addon);
-
      if constexpr (kHasOffset){
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+        const uint32_t* scales_ptr = reinterpret_cast<const uint32_t*>(scales.data());
+        uint32_t* addon_ptr = reinterpret_cast<uint32_t*>(addon);
        // possible buffer over read 2 bytes here.
        const uint32_t* p = reinterpret_cast<const uint32_t*>(offsets.data());
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+
        asm volatile(
          "{\n\t"
          "  .reg  .b32    rb0, rb1, rb2;\n"
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@ -394,14 +394,6 @@ struct ConfigOptions final {
  PROVIDER_DISALLOW_ALL(ConfigOptions)
 };

-struct OrtRunOptions final {
-  const ConfigOptions& GetConfigOptions() const {
-    return g_host->RunOptions__GetConfigOptions(this);
-  }
-
-  PROVIDER_DISALLOW_ALL(OrtRunOptions)
-};
-
 struct ComputeCapability final {
  static std::unique_ptr<ComputeCapability> Create(std::unique_ptr<IndexedSubGraph> t_sub_graph) { return g_host->ComputeCapability__construct(std::move(t_sub_graph)); }
  static void operator delete(void* p) { g_host->ComputeCapability__operator_delete(reinterpret_cast<ComputeCapability*>(p)); }
@ -1283,3 +1275,10 @@ template <>
 inline gsl::span<const int64_t> Tensor::DataAsSpan() const { return g_host->Tensor__DataAsSpan_int64(this); }

 }  // namespace onnxruntime
+
+struct OrtRunOptions final {
+  const onnxruntime::ConfigOptions& GetConfigOptions() const {
+    return onnxruntime::g_host->RunOptions__GetConfigOptions(this);
+  }
+  PROVIDER_DISALLOW_ALL(OrtRunOptions)
+};
--- a/onnxruntime/core/util/matrix_layout.h
+++ b/onnxruntime/core/util/matrix_layout.h
@ -378,7 +378,7 @@ class MatrixRef {
  MatrixRef(
      NonConstMatrixRef const& ref,  ///< MatrixRef to non-const data
      /// SFINAE trick to avoid creating a copy-constructor when Element_ is already non-const
-      _Magic magic = (typename std::enable_if<!IsNonConstRef, _Magic>::type)0
+      [[maybe_unused]] _Magic magic = (typename std::enable_if<!IsNonConstRef, _Magic>::type)0
      ) : data_(ref.data()), shape_(ref.shape()), layout_(Layout::packed(ref.shape())) {}

  ORT_FORCEINLINE
--- a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h
+++ b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h
@ -74,7 +74,8 @@ inline void sm80_prepack_quant_scales_ref(
    int columns,
    const MatrixRef<ScaleElementT const, Layout, true>& tensor_scale,
    const MatrixRef<ScaleElementT, Layout, true>& tensor_scale_prepacked) {
-  ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] == (columns / QuantBlocking::kColumn),
+  ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] ==
+                                                                             (columns / QuantBlocking::kColumn),
              "Unexpected tensor_scale shape! Expected: (",
              rows / QuantBlocking::kRow, ", ", columns / QuantBlocking::kColumn, ")");
  ORT_ENFORCE(tensor_scale_prepacked.shape() == tensor_scale.shape());
@ -84,7 +85,9 @@ inline void sm80_prepack_quant_scales_ref(
  //    2 B operand tiles per mma instruction stacked on k dimension
  //    (1,n) quantization blocking
  if constexpr (sizeof(ScaleElementT) != 2 || QuantBlocking::kRow != 1) {
-    ORT_THROW("sm80_prepack_quant_scales_ref should only be called for row-wise block quantization on 16b float values.");
+    ORT_THROW(
+        "sm80_prepack_quant_scales_ref should only be called for "
+        " row-wise block quantization on 16b float values.");
  }

  // In Ampere tensor op, each operand B tile is 8 x 8, in a warp of 32 threads, each thread
--- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
@ -80,7 +80,8 @@ TEST(TestBeamSearch, TopK) {
  std::vector<float> top_k_values_ref(batch_size * k);
  std::vector<int32_t> top_k_tokens_ref(batch_size * k);
  std::vector<int32_t> top_k_indices_ref(batch_size * k);
-  ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, beam_size, vocab_size, k);
+  ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size,
+                       beam_size, vocab_size, k);

  const int32_t max_vocab_parts = 128;
  size_t buffer_size = batch_x_beam_x_vocab * 4                                      // input
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h
@ -14,12 +14,14 @@

 #pragma once

-#include <random>
+#include "test/cuda_host/blkq4_fp16_quant_sm80.h"
+
+#include <random>
+#include <thrust/host_vector.h>

-#include "core/util/matrix_layout.h"
 #include "core/common/common.h"
 #include "core/mickey/blk_q4/f16_prepack_sm80.h"
-#include "test/cuda_host/blkq4_fp16_quant_sm80.h"
+#include "core/util/matrix_layout.h"

 namespace onnxruntime {
 namespace cuda {
@ -48,10 +50,10 @@ Status sm80_supported();
 template <typename ElementT, int block_size, bool col_blocking, bool has_offsets>
 inline void blkq4_weights_gen(
    int rows, int columns,
-    std::vector<ElementT>& dequants,
-    std::vector<uint8_t>& q_weights,
-    std::vector<ElementT>& q_scales,
-    std::vector<uint8_t>& q_zp) {
+    thrust::host_vector<ElementT>& dequants,
+    thrust::host_vector<uint8_t>& q_weights,
+    thrust::host_vector<ElementT>& q_scales,
+    thrust::host_vector<uint8_t>& q_zp) {
  using Base = onnxruntime::cuda::BlockwiseQuantization<
      ElementT,
      block_size,
@ -74,7 +76,7 @@ inline void blkq4_weights_gen(

  const auto q_weight_shape = Base::get_quant_weights_shape(rows, columns);
  const auto meta_shape = Base::get_quant_meta_shape(rows, columns);
-  const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]);
+  [[maybe_unused]] const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]);

  //
  // For testing quantization and dequantization, it is not straight
@ -120,9 +122,9 @@ inline void blkq4_weights_gen(

  q_scales.resize(meta_shape.product());
  for (size_t i = 0; i < q_scales.size(); i++) {
-    uint32_t v = dis(gen);
-    uint32_t m = (v % 63) + 1;
-    uint32_t e = (v >> 6) % 4;
+    uint32_t vl = dis(gen);
+    uint32_t m = (vl % 63) + 1;
+    uint32_t e = (vl >> 6) % 4;
    q_scales[i] = ElementT(m / static_cast<float>(1 << (2 + e)));
  }
  MatrixRef<ElementT, ColumnMajorLayout, true> tensor_scale(
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
@ -11,15 +11,15 @@
 *   well with CUTLASS headers.
 */

+#include "blkq4_fp16_gemm_sm80.h"
+
+#include "gtest/gtest.h"
+#include <thrust/host_vector.h>
 #include <random>

 #include "core/framework/float16.h"
 #include "core/mlas/inc/mlas_q4.h"

-#include "blkq4_fp16_gemm_sm80.h"
-
-#include "gtest/gtest.h"
-
 namespace onnxruntime {
 namespace test {

@ -43,10 +43,10 @@ void testPrepack(int rows, int columns) {
  const auto meta_shape = Base::get_quant_meta_shape(rows, columns);
  const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]);

-  std::vector<ElementW> q_weights;
-  std::vector<ElementT> q_scales;
-  std::vector<ElementQOffset> q_zp;
-  std::vector<ElementT> dequants;
+  thrust::host_vector<ElementW> q_weights;
+  thrust::host_vector<ElementT> q_scales;
+  thrust::host_vector<ElementQOffset> q_zp;
+  thrust::host_vector<ElementT> dequants;
  onnxruntime::cuda::test::blkq4_weights_gen<ElementT, block_size, col_blocking, has_offset>(
      rows, columns, dequants, q_weights, q_scales, q_zp);

--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu
@ -11,9 +11,11 @@
 *   well with gtest headers.
 */

+#include "blkq4_fp16_gemm_sm80.h"
+
 #include <random>
-#include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
+#include <thrust/host_vector.h>

 #include "core/mickey/blk_q4/f16_gemm_sm80.h"

@ -26,13 +28,11 @@

 #include "core/common/common.h"

-#include "blkq4_fp16_gemm_sm80.h"
-
 namespace onnxruntime {
-namespace cuda{
-namespace test{
+namespace cuda {
+namespace test {

-Status sm80_supported(){
+Status sm80_supported() {
  cudaDeviceProp props;

  cudaError_t error = cudaGetDeviceProperties(&props, 0);
@ -55,27 +55,25 @@ Status sm80_supported(){
 *        Copied directly from cutlass util/reference/device/gemm.h
 *        for the strange reason that compiler insists on asking
 *        for explicit stream argument in kernel launch.
-*/
+ */
 template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename AccumulatorType
->
+    typename ElementA,
+    typename LayoutA,
+    typename ElementB,
+    typename LayoutB,
+    typename ElementC,
+    typename LayoutC,
+    typename ScalarType,
+    typename AccumulatorType>
 void compute_gemm_ref(
-  cutlass::gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  cutlass::TensorRef<ElementA, LayoutA> tensor_a,
-  cutlass::TensorRef<ElementB, LayoutB> tensor_b,
-  ScalarType beta,
-  cutlass::TensorRef<ElementC, LayoutC> tensor_c,
-  cutlass::TensorRef<ElementC, LayoutC> tensor_d,
-  AccumulatorType initial_accum = AccumulatorType(0)) {
-
+    cutlass::gemm::GemmCoord problem_size,
+    ScalarType alpha,
+    cutlass::TensorRef<ElementA, LayoutA> tensor_a,
+    cutlass::TensorRef<ElementB, LayoutB> tensor_b,
+    ScalarType beta,
+    cutlass::TensorRef<ElementC, LayoutC> tensor_c,
+    cutlass::TensorRef<ElementC, LayoutC> tensor_d,
+    AccumulatorType initial_accum = AccumulatorType(0)) {
  // Blocking structure potentially improves performance of reference implementation
  // with a minor increase in complexity.
  //
@ -85,30 +83,27 @@ void compute_gemm_ref(
  dim3 block(16, 8);

  dim3 grid(
-    (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow),
-    (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn)
-  );
+      (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow),
+      (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn));

  // Launch a GEMM kernel
  cutlass::reference::device::kernel::Gemm<
-    cutlass::TensorRef<ElementA, LayoutA>,
-    cutlass::TensorRef<ElementB, LayoutB>,
-    cutlass::TensorRef<ElementC, LayoutC>,
-    ScalarType,
-    AccumulatorType,
-    OutputTile,
-    cutlass::multiply_add<AccumulatorType>,
-    cutlass::NumericConverter<ElementC, ScalarType>
-  ><<<grid, block, 0, 0>>>(
-    problem_size,
-    alpha,
-    tensor_a,
-    tensor_b,
-    beta,
-    tensor_c,
-    tensor_d,
-    initial_accum
-  );
+      cutlass::TensorRef<ElementA, LayoutA>,
+      cutlass::TensorRef<ElementB, LayoutB>,
+      cutlass::TensorRef<ElementC, LayoutC>,
+      ScalarType,
+      AccumulatorType,
+      OutputTile,
+      cutlass::multiply_add<AccumulatorType>,
+      cutlass::NumericConverter<ElementC, ScalarType>><<<grid, block, 0, 0>>>(
+      problem_size,
+      alpha,
+      tensor_a,
+      tensor_b,
+      beta,
+      tensor_c,
+      tensor_d,
+      initial_accum);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////

@ -117,28 +112,31 @@ void compute_gemm_ref(
 //

 template <
-  typename Element,
-  typename LayoutCutlass,
-  typename Layout = std::conditional_t<std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value, ColumnMajorLayout, RowMajorLayout>
-  >
+    typename Element,
+    typename LayoutCutlass,
+    typename Layout = std::conditional_t<std::is_same<LayoutCutlass,
+                                                      cutlass::layout::ColumnMajor>::value,
+                                         ColumnMajorLayout, RowMajorLayout>>
 __forceinline__
-MatrixRef<Element, Layout, true> make_MatrixRef(cutlass::HostTensor<Element, LayoutCutlass> const& tensor) {
-  static_assert(std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value
-                || std::is_same<LayoutCutlass, cutlass::layout::RowMajor>::value);
+    MatrixRef<Element, Layout, true>
+    make_MatrixRef(cutlass::HostTensor<Element, LayoutCutlass> const& tensor) {
+  static_assert(std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value ||
+                std::is_same<LayoutCutlass, cutlass::layout::RowMajor>::value);
  auto shape = make_Position(tensor.extent().row(), tensor.extent().column());
-  auto* ptr = const_cast<typename std::remove_const<Element>::type *>(tensor.host_data());
+  auto* ptr = const_cast<typename std::remove_const<Element>::type*>(tensor.host_data());
  return MatrixRef<Element, Layout, true>(ptr, tensor.capacity(), shape);
 }

 template <
-  typename Element,
-  typename LayoutCutlass,
-  typename Layout = std::conditional_t<std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value, ColumnMajorLayout, RowMajorLayout>
-  >
+    typename Element,
+    typename LayoutCutlass,
+    typename Layout = std::conditional_t<std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value,
+                                         ColumnMajorLayout, RowMajorLayout>>
 __forceinline__
-MatrixRef<Element const, Layout, true> make_ConstMatrixRef(cutlass::HostTensor<Element, LayoutCutlass> const& tensor) {
-  static_assert(std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value
-                || std::is_same<LayoutCutlass, cutlass::layout::RowMajor>::value);
+    MatrixRef<Element const, Layout, true>
+    make_ConstMatrixRef(cutlass::HostTensor<Element, LayoutCutlass> const& tensor) {
+  static_assert(std::is_same<LayoutCutlass, cutlass::layout::ColumnMajor>::value ||
+                std::is_same<LayoutCutlass, cutlass::layout::RowMajor>::value);
  auto shape = make_Position(tensor.extent().row(), tensor.extent().column());
  return MatrixRef<Element const, Layout, true>(tensor.host_data(), tensor.capacity(), shape);
 }
@ -147,7 +145,7 @@ MatrixRef<Element const, Layout, true> make_ConstMatrixRef(cutlass::HostTensor<E
 // Invoking the kernel
 //

-template<
+template <
    int block_size,
    bool column_wise_blocking,
    bool small_m,
@ -160,9 +158,9 @@ void run_blkq4_gemm(int m, int n, int k) {

  using ElementDequant = cutlass::half_t;
  using QuantBlocking =
-    typename std::conditional<column_wise_blocking,
-                     cutlass::MatrixShape<block_size, 1>,
-                     cutlass::MatrixShape<1, block_size>>::type;
+      typename std::conditional<column_wise_blocking,
+                                cutlass::MatrixShape<block_size, 1>,
+                                cutlass::MatrixShape<1, block_size>>::type;

  using GemmRunner = BlkQ4F16GemmImpl<ElementDequant, QuantBlocking, small_m, has_offsets>;

@ -181,17 +179,18 @@ void run_blkq4_gemm(int m, int n, int k) {
  using LayoutInputQScale = typename GemmRunner::LayoutInputQScale;

  const cutlass::gemm::GemmCoord problem_size = {m, n, k};
-  const auto q_weight_shape = cutlass::make_Coord(problem_size.k()/2, problem_size.n());
-  const auto meta_shape = cutlass::make_Coord(problem_size.k()/QuantBlocking::kRow, problem_size.n()/QuantBlocking::kColumn);
+  const auto q_weight_shape = cutlass::make_Coord(problem_size.k() / 2, problem_size.n());
+  const auto meta_shape = cutlass::make_Coord(problem_size.k() / QuantBlocking::kRow, problem_size.n() /
+                                                                                          QuantBlocking::kColumn);

  //
  // Generate quantized and dequantizeed input matrix B [K, N]
  //
  static_assert(std::is_same<LayoutInputWPack, cutlass::layout::ColumnMajor>::value);
-  std::vector<ElementW> q_weights;
-  std::vector<ElementQScale> q_scales;
-  std::vector<ElementQOffset> q_zp;
-  std::vector<ElementDequant> dequants;
+  thrust::host_vector<ElementW> q_weights;
+  thrust::host_vector<ElementQScale> q_scales;
+  thrust::host_vector<ElementQOffset> q_zp;
+  thrust::host_vector<ElementDequant> dequants;
  onnxruntime::cuda::test::blkq4_weights_gen<ElementDequant, block_size, column_wise_blocking, has_offsets>(
      problem_size.k(), problem_size.n(), dequants, q_weights, q_scales, q_zp);

@ -201,11 +200,11 @@ void run_blkq4_gemm(int m, int n, int k) {
      4,
      column_wise_blocking>;

-  std::vector<ElementW> packed_w(q_weight_shape.product());
+  thrust::host_vector<ElementW> packed_w(q_weight_shape.product());
  PrepackT::prepack_weights(problem_size.k(), problem_size.n(), q_weights, packed_w);
-  std::vector<ElementQScale> packed_scales(meta_shape.product());
+  thrust::host_vector<ElementQScale> packed_scales(meta_shape.product());
  PrepackT::prepack_quant_scales(problem_size.k(), problem_size.n(), q_scales, packed_scales);
-  std::vector<ElementQOffset> packed_zp;
+  thrust::host_vector<ElementQOffset> packed_zp;
  if constexpr (has_offsets) {
    packed_zp.resize(meta_shape.product());
    PrepackT::prepack_quant_offsets(problem_size.k(), problem_size.n(), q_zp, packed_zp);
@ -240,16 +239,16 @@ void run_blkq4_gemm(int m, int n, int k) {
  //
  thrust::device_vector<ElementW> d_packed_w(packed_w);
  cutlass::TensorRef<ElementWPack const, LayoutInputWPack> ref_W(
-    reinterpret_cast<ElementWPack const *>(d_packed_w.data().get()),
-    LayoutInputWPack::packed({problem_size.k()/2, problem_size.n()/2}));
+      reinterpret_cast<ElementWPack const*>(d_packed_w.data().get()),
+      LayoutInputWPack::packed({problem_size.k() / 2, problem_size.n() / 2}));

  thrust::device_vector<ElementQScale> d_packed_scales(packed_scales);
  cutlass::TensorRef<ElementQScale const, LayoutInputQScale> ref_scales(
-    d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape));
+      d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape));

  thrust::device_vector<ElementQOffset> d_packed_zp(packed_zp);
  cutlass::TensorRef<ElementQOffset const, LayoutInputQScale> ref_zp(
-    d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape));
+      d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape));

  tensor_a.sync_device();
  tensor_c.sync_device();
@ -257,16 +256,16 @@ void run_blkq4_gemm(int m, int n, int k) {

  // run GEMM
  cutlass::Status status;
-  if constexpr (has_offsets){
+  if constexpr (has_offsets) {
    status = GemmRunner::run(
-      nullptr, problem_size, tensor_a.device_ref(), ref_W,
-      ref_scales, ref_zp,
-      tensor_c.device_ref(), tensor_d.device_ref());
+        nullptr, problem_size, tensor_a.device_ref(), ref_W,
+        ref_scales, ref_zp,
+        tensor_c.device_ref(), tensor_d.device_ref());
  } else {
    status = GemmRunner::run(
-      nullptr, problem_size, tensor_a.device_ref(), ref_W,
-      ref_scales,
-      tensor_c.device_ref(), tensor_d.device_ref());
+        nullptr, problem_size, tensor_a.device_ref(), ref_W,
+        ref_scales,
+        tensor_c.device_ref(), tensor_d.device_ref());
  }
  ORT_ENFORCE(status == cutlass::Status::kSuccess, "Kernel execution failed: ", cutlassGetStatusString(status));

@ -275,7 +274,7 @@ void run_blkq4_gemm(int m, int n, int k) {
  using LayoutInputB = cutlass::layout::ColumnMajor;
  thrust::device_vector<ElementInputB> d_dequants(dequants);
  cutlass::TensorRef<ElementInputB, LayoutInputB> ref_B(
-    d_dequants.data().get(), LayoutInputB::packed(problem_size.kn()));
+      d_dequants.data().get(), LayoutInputB::packed(problem_size.kn()));
  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
      problem_size.mn());  // <- Create matrix D with dimensions M x N used to store output from
                           // reference kernel
@ -289,9 +288,9 @@ void run_blkq4_gemm(int m, int n, int k) {
  ElementComputeEpilogue beta = ElementComputeEpilogue(0);

  compute_gemm_ref<ElementInputA, LayoutInputA,
-               ElementInputB, LayoutInputB,
-               ElementOutput, LayoutOutput,
-               ElementComputeEpilogue, ElementAccumulator>(
+                   ElementInputB, LayoutInputB,
+                   ElementOutput, LayoutOutput,
+                   ElementComputeEpilogue, ElementAccumulator>(
      problem_size,
      alpha,
      tensor_a.device_ref(),
@ -300,17 +299,17 @@ void run_blkq4_gemm(int m, int n, int k) {
      tensor_c.device_ref(),
      tensor_ref_d.device_ref());

-  // Wait for kernels to finish
+  //// Wait for kernels to finish
  cudaDeviceSynchronize();

-  // Copy output data from CUTLASS and reference kernel to host for comparison
+  //// Copy output data from CUTLASS and reference kernel to host for comparison
  tensor_d.sync_host();
  tensor_ref_d.sync_host();

-  // Check if output from CUTLASS kernel and reference kernel are equal or not
+  //// Check if output from CUTLASS kernel and reference kernel are equal or not
  bool passed = cutlass::reference::host::TensorEquals(
-    tensor_d.host_view(),
-    tensor_ref_d.host_view());
+      tensor_d.host_view(),
+      tensor_ref_d.host_view());
  ORT_ENFORCE(passed, "Gemm kernel result wrong!");
 }

--- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
@ -5,11 +5,14 @@
 // extra code in the core of CUDA EP and that code may
 //  1. slow down performance critical applications and
 //  2. increase binary size of ORT.
-#include <iostream>
-#include "core/providers/cuda/cuda_execution_provider.h"
-#include "core/providers/cuda/cuda_allocator.h"
-#include "core/providers/cuda/cuda_stream_handle.h"
+
 #include "gtest/gtest.h"
+#include <iostream>
+
+#include "core/framework/run_options.h"
+#include "core/providers/cuda/cuda_allocator.h"
+#include "core/providers/cuda/cuda_execution_provider.h"
+#include "core/providers/cuda/cuda_stream_handle.h"

 namespace onnxruntime {
 namespace cuda {
@ -22,7 +25,7 @@ TEST(TestDeferredRelease, WithArena) {
  CUDAExecutionProvider ep(info);
  AllocatorPtr gpu_alloctor = ep.CreatePreferredAllocators()[0];

-  RunOptions run_opts;
+  onnxruntime::RunOptions run_opts;
  run_opts.run_tag = "log1";
  // Allocator for call cudaMallocHost and cudaFreeHost
  // For details, see CUDAPinnedAllocator in cuda_allocator.cc.
@ -54,7 +57,7 @@ TEST(TestDeferredRelease, WithoutArena) {
  CUDAExecutionProviderInfo info;
  CUDAExecutionProvider ep(info);

-  RunOptions run_opts;
+  onnxruntime::RunOptions run_opts;
  run_opts.run_tag = "log1";

  OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
--- a/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc
@ -6,12 +6,11 @@
 #include "core/providers/cuda/cuda_provider_factory_creator.h"
 #include "core/providers/cuda/cuda_provider_options.h"

+#include "gtest/gtest.h"
 #include <memory>
 #include <chrono>

 #include "core/common/gsl.h"
-#include "gtest/gtest.h"
-
 #include "core/providers/cuda/cuda_execution_provider.h"
 #include "core/providers/cuda/cuda_execution_provider_info.h"
 #include "core/providers/cuda/cuda_allocator.h"
@ -64,8 +63,15 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA {

  void cuda__Impl_Cast(void*, const float*, double*, size_t) override {}

-  Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return CudaCall<cudaError, false>(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); }
-  void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { CudaCall<cudaError, true>(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); }
+  Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode,
+                        const char* msg, const char* file, const int line) override {
+    return CudaCall<cudaError, false>(cudaError(retCode), exprString, libName,
+                                      cudaError(successCode), msg, file, line);
+  }
+  void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode,
+                     const char* msg, const char* file, const int line) override {
+    CudaCall<cudaError, true>(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line);
+  }

  void CopyGpuToCpu(void*, const void*, const size_t, const OrtMemoryInfo&, const OrtMemoryInfo&) override {}

@ -93,19 +99,27 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA {
    return nullptr;
  }

-  std::shared_ptr<IAllocator> CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy, onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&, const OrtArenaCfg*) override {
+  std::shared_ptr<IAllocator> CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy,
+                                                  onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&,
+                                                  const OrtArenaCfg*) override {
    return nullptr;
  }

  void TestAll() override {
-    // TestAll is the entry point of CUDA EP's insternal tests.
+    // TestAll is the entry point of CUDA EP's internal tests.
    // Those internal tests are not directly callable from onnxruntime_test_all
    // because CUDA EP is a shared library now.
    // Instead, this is a test provider that implements all the test cases.
    // onnxruntime_test_all is calling this function through TryGetProviderInfo_CUDA_Test.
-    int argc = 1;
-    std::string mock_exe_name = "onnxruntime_providers_cuda_ut";
-    char* argv[] = {const_cast<char*>(mock_exe_name.data())};
+    char mock_exe_name[] = "onnxruntime_providers_cuda_ut";
+
+    // InitGoogleTest decrements argc and removes args from argv if
+    // recognized. By doing so it decrements argc and shifts argv,
+    // to do so, from the code comments it expects argc + 1 with the last one always being nullptr
+    // otherwise, windows diagnostics reports stack corruption. when
+    int argc = 1;  // Change argc to 2 and edit the filter below if necessary
+    char* argv[] = {mock_exe_name, nullptr};
+    // char* argv[] = {mock_exe_name, "--gtest_filter=ReductionFunctionsTest.*", nullptr};
    ::testing::InitGoogleTest(&argc, argv);
    ORT_ENFORCE(RUN_ALL_TESTS() == 0);
  }
--- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
@ -1,11 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

+#include "gtest/gtest.h"
+
 #include <memory>
 #include <vector>

-#include "gtest/gtest.h"
-
 #include "core/common/common.h"
 #include "core/providers/cuda/shared_inc/cuda_call.h"
 #include "core/providers/cuda/shared_inc/cuda_utils.h"
@ -32,7 +32,8 @@ void TestFillCorrectness(size_t num_elements, TElement value) {
  Fill<TElement>(nullptr, buffer.get(), value, num_elements);

  auto cpu_buffer = std::make_unique<TElement[]>(num_elements);
-  CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), cudaMemcpyKind::cudaMemcpyDeviceToHost));
+  CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement),
+                             cudaMemcpyKind::cudaMemcpyDeviceToHost));

  std::vector<TElement> expected_data(num_elements, value);
  EXPECT_EQ(std::memcmp(cpu_buffer.get(), expected_data.data(), num_elements * sizeof(TElement)), 0);
--- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
@ -1,11 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

+#include "gtest/gtest.h"
+
 #include "core/common/common.h"
 #include "core/providers/cuda/cuda_common.h"

-#include "gtest/gtest.h"
-
 namespace onnxruntime {
 namespace cuda {
 namespace test {
--- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-#include <memory>
-
 #include "gtest/gtest.h"

+#include <memory>
+
 #include "core/providers/cuda/shared_inc/cuda_utils.h"
 #include "core/common/optional.h"
 #include "core/providers/cuda/reduction/reduction_functions.h"
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
@ -150,7 +150,9 @@ stages:
              --enable_cuda_profiling --enable_cuda_nhwc_ops \
              --enable_pybind --build_java \
              --use_cache \
-              --cmake_extra_defines  CMAKE_CUDA_ARCHITECTURES=75; \
+              --cmake_extra_defines  CMAKE_CUDA_ARCHITECTURES=75 \
+              --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON \
+              --cmake_extra_defines  onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON; \
                ccache -sv; \
                ccache -z"
      workingDirectory: $(Build.SourcesDirectory)
--- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
@ -42,7 +42,12 @@ stages:
        BuildConfig: 'RelWithDebInfo'
        EnvSetupScript: setup_env_cuda.bat
        buildArch: x64
-        additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+        additionalBuildFlags: >-
+          --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8"
+          --enable_cuda_profiling
+          --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
+          --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
        msbuildPlatform: x64
        isX86: false
        job_name_suffix: x64_RelWithDebInfo
@ -59,7 +64,10 @@ stages:
        BuildConfig: 'RelWithDebInfo'
        EnvSetupScript: setup_env_cuda.bat
        buildArch: x64
-        additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+        additionalBuildFlags: >-
+          --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8"
+          --skip_onnx_tests
+          --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
        msbuildPlatform: x64
        isX86: false
        job_name_suffix: x64_RelWithDebInfo
@ -95,7 +103,11 @@ stages:
        EnvSetupScript: setup_env_cuda.bat
        buildArch: x64
        # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags
-        additionalBuildFlags: --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF
+        additionalBuildFlags: >-
+          --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda
+          --cuda_home="$(Agent.TempDirectory)\v11.8"
+          --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF
        msbuildPlatform: x64
        isX86: false
        job_name_suffix: x64_RelWithDebInfo