From b95fd4e644775a4343c13435bd729bd64f411752 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 27 Mar 2024 13:32:36 -0700 Subject: [PATCH] Enable CUDA EP unit testing on Windows (#20039) ### Description Address build issues and source code discrepancies. Fix cuda_test_provider gtest argument stack corruption. ### Motivation and Context `OpTester` class that is widely used for kernel testing is not suitable for testing internal classes for EPs that are built as shared objects. Currently, CUDA EP tests run only on Linux. We want to enable testing and developments on Windows, and create a usable pattern for testing of other EPs internals. Alternatives considered: Abstracting EP unit tests into separate test executable such as `onnxruntime_test_all`. This alternative was rejected as it would create a lot more changes in the established patterns, and potentially interfere with CUDA functionality with more complex source code maintanence. --- cmake/CMakeLists.txt | 2 +- cmake/onnxruntime_providers_cuda.cmake | 2 +- cmake/onnxruntime_unittests.cmake | 7 + .../core/framework/execution_provider.h | 2 +- .../onnxruntime/core/framework/run_options.h | 2 +- .../core/mickey/blk_q4/f16_prepack_sm80.h | 16 +- .../threadblock/quantb_mma_multistage.h | 2 +- .../quantb_meta_mma_tensor_op_tile_iterator.h | 25 ++- .../shared_library/provider_wrappedtypes.h | 15 +- onnxruntime/core/util/matrix_layout.h | 2 +- .../test/cuda_host/blkq4_fp16_quant_sm80.h | 7 +- .../cuda/test_cases/beam_search_topk.cc | 3 +- .../cuda/test_cases/blkq4_fp16_gemm_sm80.h | 24 +-- .../test_cases/blkq4_fp16_gemm_sm80_test.cc | 16 +- .../test_cases/blkq4_fp16_gemm_sm80_testcu.cu | 187 +++++++++--------- .../cuda_execution_provider_test.cc | 15 +- .../cuda/test_cases/cuda_test_provider.cc | 32 ++- .../cuda/test_cases/cuda_utils_test.cc | 7 +- .../cuda/test_cases/gemm_options_test.cc | 4 +- .../test_cases/reduction_functions_test.cc | 4 +- .../azure-pipelines/linux-gpu-ci-pipeline.yml | 4 +- .../azure-pipelines/win-gpu-ci-pipeline.yml | 18 +- 22 files changed, 219 insertions(+), 177 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index ee1959bb35..3293506141 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -76,7 +76,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) # Enable ONNX Runtime CUDA EP's internal unit tests that directly access the EP's internal functions instead of through # OpKernels. When the option is ON, we will have two copies of GTest library in the same process. It is not a typical # use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead. -cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF) +cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF) option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF) option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index aeeac10ead..1346a9ce96 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -122,7 +122,7 @@ endif() if(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) # cuda_provider_interface.cc is removed from the object target: onnxruntime_providers_cuda_obj and - # add to the lib onnxruntime_providers_cuda separatedly. + # added to the lib onnxruntime_providers_cuda separately. # onnxruntime_providers_cuda_ut can share all the object files with onnxruntime_providers_cuda except cuda_provider_interface.cc. set(cuda_provider_interface_src ${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_provider_interface.cc) list(REMOVE_ITEM onnxruntime_providers_cuda_src ${cuda_provider_interface_src}) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 1ffb838328..4a351dcf90 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -779,6 +779,13 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock) target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey) target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common) + if (MSVC) + # Cutlass code has an issue with the following: + # warning C4100: 'magic': unreferenced formal parameter + target_compile_options(onnxruntime_providers_cuda_ut PRIVATE "$<$:SHELL:--compiler-options /wd4100>" + "$<$>:/wd4100>") + endif() + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut) endif() diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 40ca96a19a..16ad943a5f 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -53,7 +53,7 @@ struct NodeComputeInfo { DestroyFunctionStateFunc release_state_func; }; -using RunOptions = OrtRunOptions; +using RunOptions = ::OrtRunOptions; enum class DataLayout { NCHW, diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h index 5444c825d7..789c3b13f2 100644 --- a/include/onnxruntime/core/framework/run_options.h +++ b/include/onnxruntime/core/framework/run_options.h @@ -45,5 +45,5 @@ struct OrtRunOptions { }; namespace onnxruntime { -using RunOptions = OrtRunOptions; +using RunOptions = ::OrtRunOptions; } // namespace onnxruntime diff --git a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h index a08cfb97ee..c81b4967d2 100644 --- a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h +++ b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h @@ -110,8 +110,8 @@ struct BlockwiseQuantization { static void prepack_weights( int rows, int columns, - const gsl::span& weights, // <- int4 weights, column major - const gsl::span& weights_prepacked // <- int4 prepacked weights tensor, same size buffer + gsl::span weights, // <- int4 weights, column major + gsl::span weights_prepacked // <- int4 prepacked weights tensor, same size buffer ) { ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0 && (rows % QuantBlocking::kRow) == 0 && @@ -171,10 +171,10 @@ struct BlockwiseQuantization { static void prepack_quant_scales( size_t rows, size_t columns, - const gsl::span& scales, // <- quant scales, column major layout - const gsl::span& scales_prepacked // <- quant scales prepacked, same size buffer + gsl::span scales, // <- quant scales, column major layout + gsl::span scales_prepacked // <- quant scales prepacked, same size buffer ) { - auto meta_shape = get_quant_meta_shape(rows, columns); + auto meta_shape = get_quant_meta_shape(static_cast(rows), static_cast(columns)); ORT_ENFORCE(scales.size() == size_t(meta_shape.product()), "Quantization scale tensor shape mismatch!"); ORT_ENFORCE(scales_prepacked.size() == size_t(meta_shape.product()), @@ -241,10 +241,10 @@ struct BlockwiseQuantization { static void prepack_quant_offsets( size_t rows, size_t columns, - const gsl::span& offsets, // <- quant offsets, int4, column major layout - const gsl::span& offsets_prepacked // <- quant offsets prepacked, double size buffer + gsl::span offsets, // <- quant offsets, int4, column major layout + gsl::span offsets_prepacked // <- quant offsets prepacked, double size buffer ) { - auto meta_shape = get_quant_meta_shape(rows, columns); + auto meta_shape = get_quant_meta_shape(static_cast(rows), static_cast(columns)); ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0, "Does not support odd number of rows or columns!"); diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h index 8b6bac8c50..28364cc34f 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h @@ -132,7 +132,7 @@ struct DummyType{ } CUTLASS_HOST_DEVICE - std::monostate& operator[](int idx) { + std::monostate& operator[](int /*idx */) { return dummy_; } }; diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h index 4ba39dda3d..26239161cf 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h @@ -437,7 +437,7 @@ public: CUTLASS_HOST_DEVICE static void dequant(FragmentScale const &scales, - FragmentOffset const &offsets, + FragmentOffset const &fragment_offsets, Array const &weights, Array& dest){ static_assert(kNumBsPerCoreTileFragement == 2, "Only for 16b gemm."); @@ -453,19 +453,18 @@ public: uint32_t* dest_pair = reinterpret_cast(dest.data()); const b64* scales_ptr = reinterpret_cast(scales.data()); - const ElementOffset* offsets_ptr = nullptr; - if constexpr(kHasOffset) { offsets_ptr = offsets.data(); } + [[maybe_unused]] const ElementOffset* fragment_offsets_ptr = nullptr; + if constexpr(kHasOffset) { fragment_offsets_ptr = fragment_offsets.data(); } CUTLASS_PRAGMA_UNROLL for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){ // dequantize: d = scale * (weight - offset) // to use FMA, d = scale * weight + (scale * (-offset)) - b64 offsets; - if constexpr(kHasOffset){ - const uint32_t* p = reinterpret_cast(offsets_ptr); - + [[maybe_unused]] b64 offsets{0}; + if constexpr(kHasOffset) { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + const uint32_t* p = reinterpret_cast(fragment_offsets_ptr); asm volatile( "{\n\t" " .reg .b32 rb0, rb1;\n" // b32 regs for fp16x2 mul operands @@ -486,7 +485,7 @@ public: assert(0); #endif - offsets_ptr += 4; + fragment_offsets_ptr += 4; } else { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) asm volatile( @@ -541,7 +540,7 @@ public: int idx = elem_idx + mma_tile_idx * kCoreTileFragementSize + n_idx * kCoreTileFragementSize * kTilesPerMma; ElementScale s = scales[idx]; if constexpr(kHasOffset){ - offset = s * static_cast(-16 - int(offsets[idx])); + offset = s * static_cast(-16 - static_cast(fragment_offsets[idx])); } else { offset = s * static_cast(-16-8); } @@ -795,13 +794,13 @@ public: } } } else if constexpr (kMmaIterationsB % 2 == 0) { - const uint32_t* scales_ptr = reinterpret_cast(scales.data()); - uint32_t* addon_ptr = reinterpret_cast(addon); - if constexpr (kHasOffset){ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + const uint32_t* scales_ptr = reinterpret_cast(scales.data()); + uint32_t* addon_ptr = reinterpret_cast(addon); // possible buffer over read 2 bytes here. const uint32_t* p = reinterpret_cast(offsets.data()); -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( "{\n\t" " .reg .b32 rb0, rb1, rb2;\n" diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index bdad18c7ed..3bb938c1a3 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -394,14 +394,6 @@ struct ConfigOptions final { PROVIDER_DISALLOW_ALL(ConfigOptions) }; -struct OrtRunOptions final { - const ConfigOptions& GetConfigOptions() const { - return g_host->RunOptions__GetConfigOptions(this); - } - - PROVIDER_DISALLOW_ALL(OrtRunOptions) -}; - struct ComputeCapability final { static std::unique_ptr Create(std::unique_ptr t_sub_graph) { return g_host->ComputeCapability__construct(std::move(t_sub_graph)); } static void operator delete(void* p) { g_host->ComputeCapability__operator_delete(reinterpret_cast(p)); } @@ -1283,3 +1275,10 @@ template <> inline gsl::span Tensor::DataAsSpan() const { return g_host->Tensor__DataAsSpan_int64(this); } } // namespace onnxruntime + +struct OrtRunOptions final { + const onnxruntime::ConfigOptions& GetConfigOptions() const { + return onnxruntime::g_host->RunOptions__GetConfigOptions(this); + } + PROVIDER_DISALLOW_ALL(OrtRunOptions) +}; diff --git a/onnxruntime/core/util/matrix_layout.h b/onnxruntime/core/util/matrix_layout.h index 783a29d8a2..43843da3fb 100644 --- a/onnxruntime/core/util/matrix_layout.h +++ b/onnxruntime/core/util/matrix_layout.h @@ -378,7 +378,7 @@ class MatrixRef { MatrixRef( NonConstMatrixRef const& ref, ///< MatrixRef to non-const data /// SFINAE trick to avoid creating a copy-constructor when Element_ is already non-const - _Magic magic = (typename std::enable_if::type)0 + [[maybe_unused]] _Magic magic = (typename std::enable_if::type)0 ) : data_(ref.data()), shape_(ref.shape()), layout_(Layout::packed(ref.shape())) {} ORT_FORCEINLINE diff --git a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h index 6ea8b55505..942b1c4d2c 100644 --- a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h +++ b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h @@ -74,7 +74,8 @@ inline void sm80_prepack_quant_scales_ref( int columns, const MatrixRef& tensor_scale, const MatrixRef& tensor_scale_prepacked) { - ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] == (columns / QuantBlocking::kColumn), + ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] == + (columns / QuantBlocking::kColumn), "Unexpected tensor_scale shape! Expected: (", rows / QuantBlocking::kRow, ", ", columns / QuantBlocking::kColumn, ")"); ORT_ENFORCE(tensor_scale_prepacked.shape() == tensor_scale.shape()); @@ -84,7 +85,9 @@ inline void sm80_prepack_quant_scales_ref( // 2 B operand tiles per mma instruction stacked on k dimension // (1,n) quantization blocking if constexpr (sizeof(ScaleElementT) != 2 || QuantBlocking::kRow != 1) { - ORT_THROW("sm80_prepack_quant_scales_ref should only be called for row-wise block quantization on 16b float values."); + ORT_THROW( + "sm80_prepack_quant_scales_ref should only be called for " + " row-wise block quantization on 16b float values."); } // In Ampere tensor op, each operand B tile is 8 x 8, in a warp of 32 threads, each thread diff --git a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc index 9fecec9f7e..a0d115c41c 100644 --- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc +++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc @@ -80,7 +80,8 @@ TEST(TestBeamSearch, TopK) { std::vector top_k_values_ref(batch_size * k); std::vector top_k_tokens_ref(batch_size * k); std::vector top_k_indices_ref(batch_size * k); - ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, beam_size, vocab_size, k); + ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, + beam_size, vocab_size, k); const int32_t max_vocab_parts = 128; size_t buffer_size = batch_x_beam_x_vocab * 4 // input diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h index bbe370675f..f0dfaf1a58 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h @@ -14,12 +14,14 @@ #pragma once -#include +#include "test/cuda_host/blkq4_fp16_quant_sm80.h" + +#include +#include -#include "core/util/matrix_layout.h" #include "core/common/common.h" #include "core/mickey/blk_q4/f16_prepack_sm80.h" -#include "test/cuda_host/blkq4_fp16_quant_sm80.h" +#include "core/util/matrix_layout.h" namespace onnxruntime { namespace cuda { @@ -48,10 +50,10 @@ Status sm80_supported(); template inline void blkq4_weights_gen( int rows, int columns, - std::vector& dequants, - std::vector& q_weights, - std::vector& q_scales, - std::vector& q_zp) { + thrust::host_vector& dequants, + thrust::host_vector& q_weights, + thrust::host_vector& q_scales, + thrust::host_vector& q_zp) { using Base = onnxruntime::cuda::BlockwiseQuantization< ElementT, block_size, @@ -74,7 +76,7 @@ inline void blkq4_weights_gen( const auto q_weight_shape = Base::get_quant_weights_shape(rows, columns); const auto meta_shape = Base::get_quant_meta_shape(rows, columns); - const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); + [[maybe_unused]] const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); // // For testing quantization and dequantization, it is not straight @@ -120,9 +122,9 @@ inline void blkq4_weights_gen( q_scales.resize(meta_shape.product()); for (size_t i = 0; i < q_scales.size(); i++) { - uint32_t v = dis(gen); - uint32_t m = (v % 63) + 1; - uint32_t e = (v >> 6) % 4; + uint32_t vl = dis(gen); + uint32_t m = (vl % 63) + 1; + uint32_t e = (vl >> 6) % 4; q_scales[i] = ElementT(m / static_cast(1 << (2 + e))); } MatrixRef tensor_scale( diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc index e687ae73e6..e7fa0dae02 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc @@ -11,15 +11,15 @@ * well with CUTLASS headers. */ +#include "blkq4_fp16_gemm_sm80.h" + +#include "gtest/gtest.h" +#include #include #include "core/framework/float16.h" #include "core/mlas/inc/mlas_q4.h" -#include "blkq4_fp16_gemm_sm80.h" - -#include "gtest/gtest.h" - namespace onnxruntime { namespace test { @@ -43,10 +43,10 @@ void testPrepack(int rows, int columns) { const auto meta_shape = Base::get_quant_meta_shape(rows, columns); const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); - std::vector q_weights; - std::vector q_scales; - std::vector q_zp; - std::vector dequants; + thrust::host_vector q_weights; + thrust::host_vector q_scales; + thrust::host_vector q_zp; + thrust::host_vector dequants; onnxruntime::cuda::test::blkq4_weights_gen( rows, columns, dequants, q_weights, q_scales, q_zp); diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu index 69c929d446..210c33933d 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu @@ -11,9 +11,11 @@ * well with gtest headers. */ +#include "blkq4_fp16_gemm_sm80.h" + #include -#include #include +#include #include "core/mickey/blk_q4/f16_gemm_sm80.h" @@ -26,13 +28,11 @@ #include "core/common/common.h" -#include "blkq4_fp16_gemm_sm80.h" - namespace onnxruntime { -namespace cuda{ -namespace test{ +namespace cuda { +namespace test { -Status sm80_supported(){ +Status sm80_supported() { cudaDeviceProp props; cudaError_t error = cudaGetDeviceProperties(&props, 0); @@ -55,27 +55,25 @@ Status sm80_supported(){ * Copied directly from cutlass util/reference/device/gemm.h * for the strange reason that compiler insists on asking * for explicit stream argument in kernel launch. -*/ + */ template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename AccumulatorType -> + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename AccumulatorType> void compute_gemm_ref( - cutlass::gemm::GemmCoord problem_size, - ScalarType alpha, - cutlass::TensorRef tensor_a, - cutlass::TensorRef tensor_b, - ScalarType beta, - cutlass::TensorRef tensor_c, - cutlass::TensorRef tensor_d, - AccumulatorType initial_accum = AccumulatorType(0)) { - + cutlass::gemm::GemmCoord problem_size, + ScalarType alpha, + cutlass::TensorRef tensor_a, + cutlass::TensorRef tensor_b, + ScalarType beta, + cutlass::TensorRef tensor_c, + cutlass::TensorRef tensor_d, + AccumulatorType initial_accum = AccumulatorType(0)) { // Blocking structure potentially improves performance of reference implementation // with a minor increase in complexity. // @@ -85,30 +83,27 @@ void compute_gemm_ref( dim3 block(16, 8); dim3 grid( - (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), - (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn) - ); + (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), + (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn)); // Launch a GEMM kernel cutlass::reference::device::kernel::Gemm< - cutlass::TensorRef, - cutlass::TensorRef, - cutlass::TensorRef, - ScalarType, - AccumulatorType, - OutputTile, - cutlass::multiply_add, - cutlass::NumericConverter - ><<>>( - problem_size, - alpha, - tensor_a, - tensor_b, - beta, - tensor_c, - tensor_d, - initial_accum - ); + cutlass::TensorRef, + cutlass::TensorRef, + cutlass::TensorRef, + ScalarType, + AccumulatorType, + OutputTile, + cutlass::multiply_add, + cutlass::NumericConverter><<>>( + problem_size, + alpha, + tensor_a, + tensor_b, + beta, + tensor_c, + tensor_d, + initial_accum); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -117,28 +112,31 @@ void compute_gemm_ref( // template < - typename Element, - typename LayoutCutlass, - typename Layout = std::conditional_t::value, ColumnMajorLayout, RowMajorLayout> - > + typename Element, + typename LayoutCutlass, + typename Layout = std::conditional_t::value, + ColumnMajorLayout, RowMajorLayout>> __forceinline__ -MatrixRef make_MatrixRef(cutlass::HostTensor const& tensor) { - static_assert(std::is_same::value - || std::is_same::value); + MatrixRef + make_MatrixRef(cutlass::HostTensor const& tensor) { + static_assert(std::is_same::value || + std::is_same::value); auto shape = make_Position(tensor.extent().row(), tensor.extent().column()); - auto* ptr = const_cast::type *>(tensor.host_data()); + auto* ptr = const_cast::type*>(tensor.host_data()); return MatrixRef(ptr, tensor.capacity(), shape); } template < - typename Element, - typename LayoutCutlass, - typename Layout = std::conditional_t::value, ColumnMajorLayout, RowMajorLayout> - > + typename Element, + typename LayoutCutlass, + typename Layout = std::conditional_t::value, + ColumnMajorLayout, RowMajorLayout>> __forceinline__ -MatrixRef make_ConstMatrixRef(cutlass::HostTensor const& tensor) { - static_assert(std::is_same::value - || std::is_same::value); + MatrixRef + make_ConstMatrixRef(cutlass::HostTensor const& tensor) { + static_assert(std::is_same::value || + std::is_same::value); auto shape = make_Position(tensor.extent().row(), tensor.extent().column()); return MatrixRef(tensor.host_data(), tensor.capacity(), shape); } @@ -147,7 +145,7 @@ MatrixRef make_ConstMatrixRef(cutlass::HostTensor, - cutlass::MatrixShape<1, block_size>>::type; + typename std::conditional, + cutlass::MatrixShape<1, block_size>>::type; using GemmRunner = BlkQ4F16GemmImpl; @@ -181,17 +179,18 @@ void run_blkq4_gemm(int m, int n, int k) { using LayoutInputQScale = typename GemmRunner::LayoutInputQScale; const cutlass::gemm::GemmCoord problem_size = {m, n, k}; - const auto q_weight_shape = cutlass::make_Coord(problem_size.k()/2, problem_size.n()); - const auto meta_shape = cutlass::make_Coord(problem_size.k()/QuantBlocking::kRow, problem_size.n()/QuantBlocking::kColumn); + const auto q_weight_shape = cutlass::make_Coord(problem_size.k() / 2, problem_size.n()); + const auto meta_shape = cutlass::make_Coord(problem_size.k() / QuantBlocking::kRow, problem_size.n() / + QuantBlocking::kColumn); // // Generate quantized and dequantizeed input matrix B [K, N] // static_assert(std::is_same::value); - std::vector q_weights; - std::vector q_scales; - std::vector q_zp; - std::vector dequants; + thrust::host_vector q_weights; + thrust::host_vector q_scales; + thrust::host_vector q_zp; + thrust::host_vector dequants; onnxruntime::cuda::test::blkq4_weights_gen( problem_size.k(), problem_size.n(), dequants, q_weights, q_scales, q_zp); @@ -201,11 +200,11 @@ void run_blkq4_gemm(int m, int n, int k) { 4, column_wise_blocking>; - std::vector packed_w(q_weight_shape.product()); + thrust::host_vector packed_w(q_weight_shape.product()); PrepackT::prepack_weights(problem_size.k(), problem_size.n(), q_weights, packed_w); - std::vector packed_scales(meta_shape.product()); + thrust::host_vector packed_scales(meta_shape.product()); PrepackT::prepack_quant_scales(problem_size.k(), problem_size.n(), q_scales, packed_scales); - std::vector packed_zp; + thrust::host_vector packed_zp; if constexpr (has_offsets) { packed_zp.resize(meta_shape.product()); PrepackT::prepack_quant_offsets(problem_size.k(), problem_size.n(), q_zp, packed_zp); @@ -240,16 +239,16 @@ void run_blkq4_gemm(int m, int n, int k) { // thrust::device_vector d_packed_w(packed_w); cutlass::TensorRef ref_W( - reinterpret_cast(d_packed_w.data().get()), - LayoutInputWPack::packed({problem_size.k()/2, problem_size.n()/2})); + reinterpret_cast(d_packed_w.data().get()), + LayoutInputWPack::packed({problem_size.k() / 2, problem_size.n() / 2})); thrust::device_vector d_packed_scales(packed_scales); cutlass::TensorRef ref_scales( - d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape)); + d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape)); thrust::device_vector d_packed_zp(packed_zp); cutlass::TensorRef ref_zp( - d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape)); + d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape)); tensor_a.sync_device(); tensor_c.sync_device(); @@ -257,16 +256,16 @@ void run_blkq4_gemm(int m, int n, int k) { // run GEMM cutlass::Status status; - if constexpr (has_offsets){ + if constexpr (has_offsets) { status = GemmRunner::run( - nullptr, problem_size, tensor_a.device_ref(), ref_W, - ref_scales, ref_zp, - tensor_c.device_ref(), tensor_d.device_ref()); + nullptr, problem_size, tensor_a.device_ref(), ref_W, + ref_scales, ref_zp, + tensor_c.device_ref(), tensor_d.device_ref()); } else { status = GemmRunner::run( - nullptr, problem_size, tensor_a.device_ref(), ref_W, - ref_scales, - tensor_c.device_ref(), tensor_d.device_ref()); + nullptr, problem_size, tensor_a.device_ref(), ref_W, + ref_scales, + tensor_c.device_ref(), tensor_d.device_ref()); } ORT_ENFORCE(status == cutlass::Status::kSuccess, "Kernel execution failed: ", cutlassGetStatusString(status)); @@ -275,7 +274,7 @@ void run_blkq4_gemm(int m, int n, int k) { using LayoutInputB = cutlass::layout::ColumnMajor; thrust::device_vector d_dequants(dequants); cutlass::TensorRef ref_B( - d_dequants.data().get(), LayoutInputB::packed(problem_size.kn())); + d_dequants.data().get(), LayoutInputB::packed(problem_size.kn())); cutlass::HostTensor tensor_ref_d( problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from // reference kernel @@ -289,9 +288,9 @@ void run_blkq4_gemm(int m, int n, int k) { ElementComputeEpilogue beta = ElementComputeEpilogue(0); compute_gemm_ref( + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementComputeEpilogue, ElementAccumulator>( problem_size, alpha, tensor_a.device_ref(), @@ -300,17 +299,17 @@ void run_blkq4_gemm(int m, int n, int k) { tensor_c.device_ref(), tensor_ref_d.device_ref()); - // Wait for kernels to finish + //// Wait for kernels to finish cudaDeviceSynchronize(); - // Copy output data from CUTLASS and reference kernel to host for comparison + //// Copy output data from CUTLASS and reference kernel to host for comparison tensor_d.sync_host(); tensor_ref_d.sync_host(); - // Check if output from CUTLASS kernel and reference kernel are equal or not + //// Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::host::TensorEquals( - tensor_d.host_view(), - tensor_ref_d.host_view()); + tensor_d.host_view(), + tensor_ref_d.host_view()); ORT_ENFORCE(passed, "Gemm kernel result wrong!"); } diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc index 8dfaaedcbb..72357ec7e0 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc @@ -5,11 +5,14 @@ // extra code in the core of CUDA EP and that code may // 1. slow down performance critical applications and // 2. increase binary size of ORT. -#include -#include "core/providers/cuda/cuda_execution_provider.h" -#include "core/providers/cuda/cuda_allocator.h" -#include "core/providers/cuda/cuda_stream_handle.h" + #include "gtest/gtest.h" +#include + +#include "core/framework/run_options.h" +#include "core/providers/cuda/cuda_allocator.h" +#include "core/providers/cuda/cuda_execution_provider.h" +#include "core/providers/cuda/cuda_stream_handle.h" namespace onnxruntime { namespace cuda { @@ -22,7 +25,7 @@ TEST(TestDeferredRelease, WithArena) { CUDAExecutionProvider ep(info); AllocatorPtr gpu_alloctor = ep.CreatePreferredAllocators()[0]; - RunOptions run_opts; + onnxruntime::RunOptions run_opts; run_opts.run_tag = "log1"; // Allocator for call cudaMallocHost and cudaFreeHost // For details, see CUDAPinnedAllocator in cuda_allocator.cc. @@ -54,7 +57,7 @@ TEST(TestDeferredRelease, WithoutArena) { CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); - RunOptions run_opts; + onnxruntime::RunOptions run_opts; run_opts.run_tag = "log1"; OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc index 96c1e17331..d8384b4327 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc @@ -6,12 +6,11 @@ #include "core/providers/cuda/cuda_provider_factory_creator.h" #include "core/providers/cuda/cuda_provider_options.h" +#include "gtest/gtest.h" #include #include #include "core/common/gsl.h" -#include "gtest/gtest.h" - #include "core/providers/cuda/cuda_execution_provider.h" #include "core/providers/cuda/cuda_execution_provider_info.h" #include "core/providers/cuda/cuda_allocator.h" @@ -64,8 +63,15 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA { void cuda__Impl_Cast(void*, const float*, double*, size_t) override {} - Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); } - void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); } + Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, + const char* msg, const char* file, const int line) override { + return CudaCall(cudaError(retCode), exprString, libName, + cudaError(successCode), msg, file, line); + } + void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, + const char* msg, const char* file, const int line) override { + CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); + } void CopyGpuToCpu(void*, const void*, const size_t, const OrtMemoryInfo&, const OrtMemoryInfo&) override {} @@ -93,19 +99,27 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA { return nullptr; } - std::shared_ptr CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy, onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&, const OrtArenaCfg*) override { + std::shared_ptr CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy, + onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&, + const OrtArenaCfg*) override { return nullptr; } void TestAll() override { - // TestAll is the entry point of CUDA EP's insternal tests. + // TestAll is the entry point of CUDA EP's internal tests. // Those internal tests are not directly callable from onnxruntime_test_all // because CUDA EP is a shared library now. // Instead, this is a test provider that implements all the test cases. // onnxruntime_test_all is calling this function through TryGetProviderInfo_CUDA_Test. - int argc = 1; - std::string mock_exe_name = "onnxruntime_providers_cuda_ut"; - char* argv[] = {const_cast(mock_exe_name.data())}; + char mock_exe_name[] = "onnxruntime_providers_cuda_ut"; + + // InitGoogleTest decrements argc and removes args from argv if + // recognized. By doing so it decrements argc and shifts argv, + // to do so, from the code comments it expects argc + 1 with the last one always being nullptr + // otherwise, windows diagnostics reports stack corruption. when + int argc = 1; // Change argc to 2 and edit the filter below if necessary + char* argv[] = {mock_exe_name, nullptr}; + // char* argv[] = {mock_exe_name, "--gtest_filter=ReductionFunctionsTest.*", nullptr}; ::testing::InitGoogleTest(&argc, argv); ORT_ENFORCE(RUN_ALL_TESTS() == 0); } diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc index 9d20bc545d..7468a57184 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "gtest/gtest.h" + #include #include -#include "gtest/gtest.h" - #include "core/common/common.h" #include "core/providers/cuda/shared_inc/cuda_call.h" #include "core/providers/cuda/shared_inc/cuda_utils.h" @@ -32,7 +32,8 @@ void TestFillCorrectness(size_t num_elements, TElement value) { Fill(nullptr, buffer.get(), value, num_elements); auto cpu_buffer = std::make_unique(num_elements); - CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), cudaMemcpyKind::cudaMemcpyDeviceToHost)); + CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), + cudaMemcpyKind::cudaMemcpyDeviceToHost)); std::vector expected_data(num_elements, value); EXPECT_EQ(std::memcmp(cpu_buffer.get(), expected_data.data(), num_elements * sizeof(TElement)), 0); diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc index 4917701e51..6636e15040 100644 --- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "gtest/gtest.h" + #include "core/common/common.h" #include "core/providers/cuda/cuda_common.h" -#include "gtest/gtest.h" - namespace onnxruntime { namespace cuda { namespace test { diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc index c460e806c1..ec7e985285 100644 --- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include - #include "gtest/gtest.h" +#include + #include "core/providers/cuda/shared_inc/cuda_utils.h" #include "core/common/optional.h" #include "core/providers/cuda/reduction/reduction_functions.h" diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index b7232e9dc4..0e885b71b4 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -150,7 +150,9 @@ stages: --enable_cuda_profiling --enable_cuda_nhwc_ops \ --enable_pybind --build_java \ --use_cache \ - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75; \ + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 \ + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON \ + --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON; \ ccache -sv; \ ccache -z" workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index eee38ac04b..291e2f4e19 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -42,7 +42,12 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_cuda.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + additionalBuildFlags: >- + --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --enable_cuda_profiling + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON + --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo @@ -59,7 +64,10 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_cuda.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + additionalBuildFlags: >- + --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --skip_onnx_tests + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo @@ -95,7 +103,11 @@ stages: EnvSetupScript: setup_env_cuda.bat buildArch: x64 # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags - additionalBuildFlags: --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF + additionalBuildFlags: >- + --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda + --cuda_home="$(Agent.TempDirectory)\v11.8" + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo