diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index ee1959bb35..3293506141 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -76,7 +76,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) # Enable ONNX Runtime CUDA EP's internal unit tests that directly access the EP's internal functions instead of through # OpKernels. When the option is ON, we will have two copies of GTest library in the same process. It is not a typical # use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead. -cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF) +cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF) option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF) option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index aeeac10ead..1346a9ce96 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -122,7 +122,7 @@ endif() if(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) # cuda_provider_interface.cc is removed from the object target: onnxruntime_providers_cuda_obj and - # add to the lib onnxruntime_providers_cuda separatedly. + # added to the lib onnxruntime_providers_cuda separately. # onnxruntime_providers_cuda_ut can share all the object files with onnxruntime_providers_cuda except cuda_provider_interface.cc. set(cuda_provider_interface_src ${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_provider_interface.cc) list(REMOVE_ITEM onnxruntime_providers_cuda_src ${cuda_provider_interface_src}) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 1ffb838328..4a351dcf90 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -779,6 +779,13 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock) target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey) target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common) + if (MSVC) + # Cutlass code has an issue with the following: + # warning C4100: 'magic': unreferenced formal parameter + target_compile_options(onnxruntime_providers_cuda_ut PRIVATE "$<$:SHELL:--compiler-options /wd4100>" + "$<$>:/wd4100>") + endif() + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut) endif() diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 40ca96a19a..16ad943a5f 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -53,7 +53,7 @@ struct NodeComputeInfo { DestroyFunctionStateFunc release_state_func; }; -using RunOptions = OrtRunOptions; +using RunOptions = ::OrtRunOptions; enum class DataLayout { NCHW, diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h index 5444c825d7..789c3b13f2 100644 --- a/include/onnxruntime/core/framework/run_options.h +++ b/include/onnxruntime/core/framework/run_options.h @@ -45,5 +45,5 @@ struct OrtRunOptions { }; namespace onnxruntime { -using RunOptions = OrtRunOptions; +using RunOptions = ::OrtRunOptions; } // namespace onnxruntime diff --git a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h index a08cfb97ee..c81b4967d2 100644 --- a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h +++ b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h @@ -110,8 +110,8 @@ struct BlockwiseQuantization { static void prepack_weights( int rows, int columns, - const gsl::span& weights, // <- int4 weights, column major - const gsl::span& weights_prepacked // <- int4 prepacked weights tensor, same size buffer + gsl::span weights, // <- int4 weights, column major + gsl::span weights_prepacked // <- int4 prepacked weights tensor, same size buffer ) { ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0 && (rows % QuantBlocking::kRow) == 0 && @@ -171,10 +171,10 @@ struct BlockwiseQuantization { static void prepack_quant_scales( size_t rows, size_t columns, - const gsl::span& scales, // <- quant scales, column major layout - const gsl::span& scales_prepacked // <- quant scales prepacked, same size buffer + gsl::span scales, // <- quant scales, column major layout + gsl::span scales_prepacked // <- quant scales prepacked, same size buffer ) { - auto meta_shape = get_quant_meta_shape(rows, columns); + auto meta_shape = get_quant_meta_shape(static_cast(rows), static_cast(columns)); ORT_ENFORCE(scales.size() == size_t(meta_shape.product()), "Quantization scale tensor shape mismatch!"); ORT_ENFORCE(scales_prepacked.size() == size_t(meta_shape.product()), @@ -241,10 +241,10 @@ struct BlockwiseQuantization { static void prepack_quant_offsets( size_t rows, size_t columns, - const gsl::span& offsets, // <- quant offsets, int4, column major layout - const gsl::span& offsets_prepacked // <- quant offsets prepacked, double size buffer + gsl::span offsets, // <- quant offsets, int4, column major layout + gsl::span offsets_prepacked // <- quant offsets prepacked, double size buffer ) { - auto meta_shape = get_quant_meta_shape(rows, columns); + auto meta_shape = get_quant_meta_shape(static_cast(rows), static_cast(columns)); ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0, "Does not support odd number of rows or columns!"); diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h index 8b6bac8c50..28364cc34f 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h @@ -132,7 +132,7 @@ struct DummyType{ } CUTLASS_HOST_DEVICE - std::monostate& operator[](int idx) { + std::monostate& operator[](int /*idx */) { return dummy_; } }; diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h index 4ba39dda3d..26239161cf 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h @@ -437,7 +437,7 @@ public: CUTLASS_HOST_DEVICE static void dequant(FragmentScale const &scales, - FragmentOffset const &offsets, + FragmentOffset const &fragment_offsets, Array const &weights, Array& dest){ static_assert(kNumBsPerCoreTileFragement == 2, "Only for 16b gemm."); @@ -453,19 +453,18 @@ public: uint32_t* dest_pair = reinterpret_cast(dest.data()); const b64* scales_ptr = reinterpret_cast(scales.data()); - const ElementOffset* offsets_ptr = nullptr; - if constexpr(kHasOffset) { offsets_ptr = offsets.data(); } + [[maybe_unused]] const ElementOffset* fragment_offsets_ptr = nullptr; + if constexpr(kHasOffset) { fragment_offsets_ptr = fragment_offsets.data(); } CUTLASS_PRAGMA_UNROLL for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){ // dequantize: d = scale * (weight - offset) // to use FMA, d = scale * weight + (scale * (-offset)) - b64 offsets; - if constexpr(kHasOffset){ - const uint32_t* p = reinterpret_cast(offsets_ptr); - + [[maybe_unused]] b64 offsets{0}; + if constexpr(kHasOffset) { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + const uint32_t* p = reinterpret_cast(fragment_offsets_ptr); asm volatile( "{\n\t" " .reg .b32 rb0, rb1;\n" // b32 regs for fp16x2 mul operands @@ -486,7 +485,7 @@ public: assert(0); #endif - offsets_ptr += 4; + fragment_offsets_ptr += 4; } else { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) asm volatile( @@ -541,7 +540,7 @@ public: int idx = elem_idx + mma_tile_idx * kCoreTileFragementSize + n_idx * kCoreTileFragementSize * kTilesPerMma; ElementScale s = scales[idx]; if constexpr(kHasOffset){ - offset = s * static_cast(-16 - int(offsets[idx])); + offset = s * static_cast(-16 - static_cast(fragment_offsets[idx])); } else { offset = s * static_cast(-16-8); } @@ -795,13 +794,13 @@ public: } } } else if constexpr (kMmaIterationsB % 2 == 0) { - const uint32_t* scales_ptr = reinterpret_cast(scales.data()); - uint32_t* addon_ptr = reinterpret_cast(addon); - if constexpr (kHasOffset){ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + const uint32_t* scales_ptr = reinterpret_cast(scales.data()); + uint32_t* addon_ptr = reinterpret_cast(addon); // possible buffer over read 2 bytes here. const uint32_t* p = reinterpret_cast(offsets.data()); -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( "{\n\t" " .reg .b32 rb0, rb1, rb2;\n" diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index bdad18c7ed..3bb938c1a3 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -394,14 +394,6 @@ struct ConfigOptions final { PROVIDER_DISALLOW_ALL(ConfigOptions) }; -struct OrtRunOptions final { - const ConfigOptions& GetConfigOptions() const { - return g_host->RunOptions__GetConfigOptions(this); - } - - PROVIDER_DISALLOW_ALL(OrtRunOptions) -}; - struct ComputeCapability final { static std::unique_ptr Create(std::unique_ptr t_sub_graph) { return g_host->ComputeCapability__construct(std::move(t_sub_graph)); } static void operator delete(void* p) { g_host->ComputeCapability__operator_delete(reinterpret_cast(p)); } @@ -1283,3 +1275,10 @@ template <> inline gsl::span Tensor::DataAsSpan() const { return g_host->Tensor__DataAsSpan_int64(this); } } // namespace onnxruntime + +struct OrtRunOptions final { + const onnxruntime::ConfigOptions& GetConfigOptions() const { + return onnxruntime::g_host->RunOptions__GetConfigOptions(this); + } + PROVIDER_DISALLOW_ALL(OrtRunOptions) +}; diff --git a/onnxruntime/core/util/matrix_layout.h b/onnxruntime/core/util/matrix_layout.h index 783a29d8a2..43843da3fb 100644 --- a/onnxruntime/core/util/matrix_layout.h +++ b/onnxruntime/core/util/matrix_layout.h @@ -378,7 +378,7 @@ class MatrixRef { MatrixRef( NonConstMatrixRef const& ref, ///< MatrixRef to non-const data /// SFINAE trick to avoid creating a copy-constructor when Element_ is already non-const - _Magic magic = (typename std::enable_if::type)0 + [[maybe_unused]] _Magic magic = (typename std::enable_if::type)0 ) : data_(ref.data()), shape_(ref.shape()), layout_(Layout::packed(ref.shape())) {} ORT_FORCEINLINE diff --git a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h index 6ea8b55505..942b1c4d2c 100644 --- a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h +++ b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h @@ -74,7 +74,8 @@ inline void sm80_prepack_quant_scales_ref( int columns, const MatrixRef& tensor_scale, const MatrixRef& tensor_scale_prepacked) { - ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] == (columns / QuantBlocking::kColumn), + ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] == + (columns / QuantBlocking::kColumn), "Unexpected tensor_scale shape! Expected: (", rows / QuantBlocking::kRow, ", ", columns / QuantBlocking::kColumn, ")"); ORT_ENFORCE(tensor_scale_prepacked.shape() == tensor_scale.shape()); @@ -84,7 +85,9 @@ inline void sm80_prepack_quant_scales_ref( // 2 B operand tiles per mma instruction stacked on k dimension // (1,n) quantization blocking if constexpr (sizeof(ScaleElementT) != 2 || QuantBlocking::kRow != 1) { - ORT_THROW("sm80_prepack_quant_scales_ref should only be called for row-wise block quantization on 16b float values."); + ORT_THROW( + "sm80_prepack_quant_scales_ref should only be called for " + " row-wise block quantization on 16b float values."); } // In Ampere tensor op, each operand B tile is 8 x 8, in a warp of 32 threads, each thread diff --git a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc index 9fecec9f7e..a0d115c41c 100644 --- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc +++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc @@ -80,7 +80,8 @@ TEST(TestBeamSearch, TopK) { std::vector top_k_values_ref(batch_size * k); std::vector top_k_tokens_ref(batch_size * k); std::vector top_k_indices_ref(batch_size * k); - ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, beam_size, vocab_size, k); + ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, + beam_size, vocab_size, k); const int32_t max_vocab_parts = 128; size_t buffer_size = batch_x_beam_x_vocab * 4 // input diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h index bbe370675f..f0dfaf1a58 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h @@ -14,12 +14,14 @@ #pragma once -#include +#include "test/cuda_host/blkq4_fp16_quant_sm80.h" + +#include +#include -#include "core/util/matrix_layout.h" #include "core/common/common.h" #include "core/mickey/blk_q4/f16_prepack_sm80.h" -#include "test/cuda_host/blkq4_fp16_quant_sm80.h" +#include "core/util/matrix_layout.h" namespace onnxruntime { namespace cuda { @@ -48,10 +50,10 @@ Status sm80_supported(); template inline void blkq4_weights_gen( int rows, int columns, - std::vector& dequants, - std::vector& q_weights, - std::vector& q_scales, - std::vector& q_zp) { + thrust::host_vector& dequants, + thrust::host_vector& q_weights, + thrust::host_vector& q_scales, + thrust::host_vector& q_zp) { using Base = onnxruntime::cuda::BlockwiseQuantization< ElementT, block_size, @@ -74,7 +76,7 @@ inline void blkq4_weights_gen( const auto q_weight_shape = Base::get_quant_weights_shape(rows, columns); const auto meta_shape = Base::get_quant_meta_shape(rows, columns); - const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); + [[maybe_unused]] const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); // // For testing quantization and dequantization, it is not straight @@ -120,9 +122,9 @@ inline void blkq4_weights_gen( q_scales.resize(meta_shape.product()); for (size_t i = 0; i < q_scales.size(); i++) { - uint32_t v = dis(gen); - uint32_t m = (v % 63) + 1; - uint32_t e = (v >> 6) % 4; + uint32_t vl = dis(gen); + uint32_t m = (vl % 63) + 1; + uint32_t e = (vl >> 6) % 4; q_scales[i] = ElementT(m / static_cast(1 << (2 + e))); } MatrixRef tensor_scale( diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc index e687ae73e6..e7fa0dae02 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc @@ -11,15 +11,15 @@ * well with CUTLASS headers. */ +#include "blkq4_fp16_gemm_sm80.h" + +#include "gtest/gtest.h" +#include #include #include "core/framework/float16.h" #include "core/mlas/inc/mlas_q4.h" -#include "blkq4_fp16_gemm_sm80.h" - -#include "gtest/gtest.h" - namespace onnxruntime { namespace test { @@ -43,10 +43,10 @@ void testPrepack(int rows, int columns) { const auto meta_shape = Base::get_quant_meta_shape(rows, columns); const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); - std::vector q_weights; - std::vector q_scales; - std::vector q_zp; - std::vector dequants; + thrust::host_vector q_weights; + thrust::host_vector q_scales; + thrust::host_vector q_zp; + thrust::host_vector dequants; onnxruntime::cuda::test::blkq4_weights_gen( rows, columns, dequants, q_weights, q_scales, q_zp); diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu index 69c929d446..210c33933d 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu @@ -11,9 +11,11 @@ * well with gtest headers. */ +#include "blkq4_fp16_gemm_sm80.h" + #include -#include #include +#include #include "core/mickey/blk_q4/f16_gemm_sm80.h" @@ -26,13 +28,11 @@ #include "core/common/common.h" -#include "blkq4_fp16_gemm_sm80.h" - namespace onnxruntime { -namespace cuda{ -namespace test{ +namespace cuda { +namespace test { -Status sm80_supported(){ +Status sm80_supported() { cudaDeviceProp props; cudaError_t error = cudaGetDeviceProperties(&props, 0); @@ -55,27 +55,25 @@ Status sm80_supported(){ * Copied directly from cutlass util/reference/device/gemm.h * for the strange reason that compiler insists on asking * for explicit stream argument in kernel launch. -*/ + */ template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename AccumulatorType -> + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename AccumulatorType> void compute_gemm_ref( - cutlass::gemm::GemmCoord problem_size, - ScalarType alpha, - cutlass::TensorRef tensor_a, - cutlass::TensorRef tensor_b, - ScalarType beta, - cutlass::TensorRef tensor_c, - cutlass::TensorRef tensor_d, - AccumulatorType initial_accum = AccumulatorType(0)) { - + cutlass::gemm::GemmCoord problem_size, + ScalarType alpha, + cutlass::TensorRef tensor_a, + cutlass::TensorRef tensor_b, + ScalarType beta, + cutlass::TensorRef tensor_c, + cutlass::TensorRef tensor_d, + AccumulatorType initial_accum = AccumulatorType(0)) { // Blocking structure potentially improves performance of reference implementation // with a minor increase in complexity. // @@ -85,30 +83,27 @@ void compute_gemm_ref( dim3 block(16, 8); dim3 grid( - (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), - (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn) - ); + (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), + (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn)); // Launch a GEMM kernel cutlass::reference::device::kernel::Gemm< - cutlass::TensorRef, - cutlass::TensorRef, - cutlass::TensorRef, - ScalarType, - AccumulatorType, - OutputTile, - cutlass::multiply_add, - cutlass::NumericConverter - ><<>>( - problem_size, - alpha, - tensor_a, - tensor_b, - beta, - tensor_c, - tensor_d, - initial_accum - ); + cutlass::TensorRef, + cutlass::TensorRef, + cutlass::TensorRef, + ScalarType, + AccumulatorType, + OutputTile, + cutlass::multiply_add, + cutlass::NumericConverter><<>>( + problem_size, + alpha, + tensor_a, + tensor_b, + beta, + tensor_c, + tensor_d, + initial_accum); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -117,28 +112,31 @@ void compute_gemm_ref( // template < - typename Element, - typename LayoutCutlass, - typename Layout = std::conditional_t::value, ColumnMajorLayout, RowMajorLayout> - > + typename Element, + typename LayoutCutlass, + typename Layout = std::conditional_t::value, + ColumnMajorLayout, RowMajorLayout>> __forceinline__ -MatrixRef make_MatrixRef(cutlass::HostTensor const& tensor) { - static_assert(std::is_same::value - || std::is_same::value); + MatrixRef + make_MatrixRef(cutlass::HostTensor const& tensor) { + static_assert(std::is_same::value || + std::is_same::value); auto shape = make_Position(tensor.extent().row(), tensor.extent().column()); - auto* ptr = const_cast::type *>(tensor.host_data()); + auto* ptr = const_cast::type*>(tensor.host_data()); return MatrixRef(ptr, tensor.capacity(), shape); } template < - typename Element, - typename LayoutCutlass, - typename Layout = std::conditional_t::value, ColumnMajorLayout, RowMajorLayout> - > + typename Element, + typename LayoutCutlass, + typename Layout = std::conditional_t::value, + ColumnMajorLayout, RowMajorLayout>> __forceinline__ -MatrixRef make_ConstMatrixRef(cutlass::HostTensor const& tensor) { - static_assert(std::is_same::value - || std::is_same::value); + MatrixRef + make_ConstMatrixRef(cutlass::HostTensor const& tensor) { + static_assert(std::is_same::value || + std::is_same::value); auto shape = make_Position(tensor.extent().row(), tensor.extent().column()); return MatrixRef(tensor.host_data(), tensor.capacity(), shape); } @@ -147,7 +145,7 @@ MatrixRef make_ConstMatrixRef(cutlass::HostTensor, - cutlass::MatrixShape<1, block_size>>::type; + typename std::conditional, + cutlass::MatrixShape<1, block_size>>::type; using GemmRunner = BlkQ4F16GemmImpl; @@ -181,17 +179,18 @@ void run_blkq4_gemm(int m, int n, int k) { using LayoutInputQScale = typename GemmRunner::LayoutInputQScale; const cutlass::gemm::GemmCoord problem_size = {m, n, k}; - const auto q_weight_shape = cutlass::make_Coord(problem_size.k()/2, problem_size.n()); - const auto meta_shape = cutlass::make_Coord(problem_size.k()/QuantBlocking::kRow, problem_size.n()/QuantBlocking::kColumn); + const auto q_weight_shape = cutlass::make_Coord(problem_size.k() / 2, problem_size.n()); + const auto meta_shape = cutlass::make_Coord(problem_size.k() / QuantBlocking::kRow, problem_size.n() / + QuantBlocking::kColumn); // // Generate quantized and dequantizeed input matrix B [K, N] // static_assert(std::is_same::value); - std::vector q_weights; - std::vector q_scales; - std::vector q_zp; - std::vector dequants; + thrust::host_vector q_weights; + thrust::host_vector q_scales; + thrust::host_vector q_zp; + thrust::host_vector dequants; onnxruntime::cuda::test::blkq4_weights_gen( problem_size.k(), problem_size.n(), dequants, q_weights, q_scales, q_zp); @@ -201,11 +200,11 @@ void run_blkq4_gemm(int m, int n, int k) { 4, column_wise_blocking>; - std::vector packed_w(q_weight_shape.product()); + thrust::host_vector packed_w(q_weight_shape.product()); PrepackT::prepack_weights(problem_size.k(), problem_size.n(), q_weights, packed_w); - std::vector packed_scales(meta_shape.product()); + thrust::host_vector packed_scales(meta_shape.product()); PrepackT::prepack_quant_scales(problem_size.k(), problem_size.n(), q_scales, packed_scales); - std::vector packed_zp; + thrust::host_vector packed_zp; if constexpr (has_offsets) { packed_zp.resize(meta_shape.product()); PrepackT::prepack_quant_offsets(problem_size.k(), problem_size.n(), q_zp, packed_zp); @@ -240,16 +239,16 @@ void run_blkq4_gemm(int m, int n, int k) { // thrust::device_vector d_packed_w(packed_w); cutlass::TensorRef ref_W( - reinterpret_cast(d_packed_w.data().get()), - LayoutInputWPack::packed({problem_size.k()/2, problem_size.n()/2})); + reinterpret_cast(d_packed_w.data().get()), + LayoutInputWPack::packed({problem_size.k() / 2, problem_size.n() / 2})); thrust::device_vector d_packed_scales(packed_scales); cutlass::TensorRef ref_scales( - d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape)); + d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape)); thrust::device_vector d_packed_zp(packed_zp); cutlass::TensorRef ref_zp( - d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape)); + d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape)); tensor_a.sync_device(); tensor_c.sync_device(); @@ -257,16 +256,16 @@ void run_blkq4_gemm(int m, int n, int k) { // run GEMM cutlass::Status status; - if constexpr (has_offsets){ + if constexpr (has_offsets) { status = GemmRunner::run( - nullptr, problem_size, tensor_a.device_ref(), ref_W, - ref_scales, ref_zp, - tensor_c.device_ref(), tensor_d.device_ref()); + nullptr, problem_size, tensor_a.device_ref(), ref_W, + ref_scales, ref_zp, + tensor_c.device_ref(), tensor_d.device_ref()); } else { status = GemmRunner::run( - nullptr, problem_size, tensor_a.device_ref(), ref_W, - ref_scales, - tensor_c.device_ref(), tensor_d.device_ref()); + nullptr, problem_size, tensor_a.device_ref(), ref_W, + ref_scales, + tensor_c.device_ref(), tensor_d.device_ref()); } ORT_ENFORCE(status == cutlass::Status::kSuccess, "Kernel execution failed: ", cutlassGetStatusString(status)); @@ -275,7 +274,7 @@ void run_blkq4_gemm(int m, int n, int k) { using LayoutInputB = cutlass::layout::ColumnMajor; thrust::device_vector d_dequants(dequants); cutlass::TensorRef ref_B( - d_dequants.data().get(), LayoutInputB::packed(problem_size.kn())); + d_dequants.data().get(), LayoutInputB::packed(problem_size.kn())); cutlass::HostTensor tensor_ref_d( problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from // reference kernel @@ -289,9 +288,9 @@ void run_blkq4_gemm(int m, int n, int k) { ElementComputeEpilogue beta = ElementComputeEpilogue(0); compute_gemm_ref( + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementComputeEpilogue, ElementAccumulator>( problem_size, alpha, tensor_a.device_ref(), @@ -300,17 +299,17 @@ void run_blkq4_gemm(int m, int n, int k) { tensor_c.device_ref(), tensor_ref_d.device_ref()); - // Wait for kernels to finish + //// Wait for kernels to finish cudaDeviceSynchronize(); - // Copy output data from CUTLASS and reference kernel to host for comparison + //// Copy output data from CUTLASS and reference kernel to host for comparison tensor_d.sync_host(); tensor_ref_d.sync_host(); - // Check if output from CUTLASS kernel and reference kernel are equal or not + //// Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::host::TensorEquals( - tensor_d.host_view(), - tensor_ref_d.host_view()); + tensor_d.host_view(), + tensor_ref_d.host_view()); ORT_ENFORCE(passed, "Gemm kernel result wrong!"); } diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc index 8dfaaedcbb..72357ec7e0 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc @@ -5,11 +5,14 @@ // extra code in the core of CUDA EP and that code may // 1. slow down performance critical applications and // 2. increase binary size of ORT. -#include -#include "core/providers/cuda/cuda_execution_provider.h" -#include "core/providers/cuda/cuda_allocator.h" -#include "core/providers/cuda/cuda_stream_handle.h" + #include "gtest/gtest.h" +#include + +#include "core/framework/run_options.h" +#include "core/providers/cuda/cuda_allocator.h" +#include "core/providers/cuda/cuda_execution_provider.h" +#include "core/providers/cuda/cuda_stream_handle.h" namespace onnxruntime { namespace cuda { @@ -22,7 +25,7 @@ TEST(TestDeferredRelease, WithArena) { CUDAExecutionProvider ep(info); AllocatorPtr gpu_alloctor = ep.CreatePreferredAllocators()[0]; - RunOptions run_opts; + onnxruntime::RunOptions run_opts; run_opts.run_tag = "log1"; // Allocator for call cudaMallocHost and cudaFreeHost // For details, see CUDAPinnedAllocator in cuda_allocator.cc. @@ -54,7 +57,7 @@ TEST(TestDeferredRelease, WithoutArena) { CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); - RunOptions run_opts; + onnxruntime::RunOptions run_opts; run_opts.run_tag = "log1"; OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc index 96c1e17331..d8384b4327 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc @@ -6,12 +6,11 @@ #include "core/providers/cuda/cuda_provider_factory_creator.h" #include "core/providers/cuda/cuda_provider_options.h" +#include "gtest/gtest.h" #include #include #include "core/common/gsl.h" -#include "gtest/gtest.h" - #include "core/providers/cuda/cuda_execution_provider.h" #include "core/providers/cuda/cuda_execution_provider_info.h" #include "core/providers/cuda/cuda_allocator.h" @@ -64,8 +63,15 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA { void cuda__Impl_Cast(void*, const float*, double*, size_t) override {} - Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); } - void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); } + Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, + const char* msg, const char* file, const int line) override { + return CudaCall(cudaError(retCode), exprString, libName, + cudaError(successCode), msg, file, line); + } + void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, + const char* msg, const char* file, const int line) override { + CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); + } void CopyGpuToCpu(void*, const void*, const size_t, const OrtMemoryInfo&, const OrtMemoryInfo&) override {} @@ -93,19 +99,27 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA { return nullptr; } - std::shared_ptr CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy, onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&, const OrtArenaCfg*) override { + std::shared_ptr CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy, + onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&, + const OrtArenaCfg*) override { return nullptr; } void TestAll() override { - // TestAll is the entry point of CUDA EP's insternal tests. + // TestAll is the entry point of CUDA EP's internal tests. // Those internal tests are not directly callable from onnxruntime_test_all // because CUDA EP is a shared library now. // Instead, this is a test provider that implements all the test cases. // onnxruntime_test_all is calling this function through TryGetProviderInfo_CUDA_Test. - int argc = 1; - std::string mock_exe_name = "onnxruntime_providers_cuda_ut"; - char* argv[] = {const_cast(mock_exe_name.data())}; + char mock_exe_name[] = "onnxruntime_providers_cuda_ut"; + + // InitGoogleTest decrements argc and removes args from argv if + // recognized. By doing so it decrements argc and shifts argv, + // to do so, from the code comments it expects argc + 1 with the last one always being nullptr + // otherwise, windows diagnostics reports stack corruption. when + int argc = 1; // Change argc to 2 and edit the filter below if necessary + char* argv[] = {mock_exe_name, nullptr}; + // char* argv[] = {mock_exe_name, "--gtest_filter=ReductionFunctionsTest.*", nullptr}; ::testing::InitGoogleTest(&argc, argv); ORT_ENFORCE(RUN_ALL_TESTS() == 0); } diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc index 9d20bc545d..7468a57184 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "gtest/gtest.h" + #include #include -#include "gtest/gtest.h" - #include "core/common/common.h" #include "core/providers/cuda/shared_inc/cuda_call.h" #include "core/providers/cuda/shared_inc/cuda_utils.h" @@ -32,7 +32,8 @@ void TestFillCorrectness(size_t num_elements, TElement value) { Fill(nullptr, buffer.get(), value, num_elements); auto cpu_buffer = std::make_unique(num_elements); - CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), cudaMemcpyKind::cudaMemcpyDeviceToHost)); + CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), + cudaMemcpyKind::cudaMemcpyDeviceToHost)); std::vector expected_data(num_elements, value); EXPECT_EQ(std::memcmp(cpu_buffer.get(), expected_data.data(), num_elements * sizeof(TElement)), 0); diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc index 4917701e51..6636e15040 100644 --- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "gtest/gtest.h" + #include "core/common/common.h" #include "core/providers/cuda/cuda_common.h" -#include "gtest/gtest.h" - namespace onnxruntime { namespace cuda { namespace test { diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc index c460e806c1..ec7e985285 100644 --- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include - #include "gtest/gtest.h" +#include + #include "core/providers/cuda/shared_inc/cuda_utils.h" #include "core/common/optional.h" #include "core/providers/cuda/reduction/reduction_functions.h" diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index b7232e9dc4..0e885b71b4 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -150,7 +150,9 @@ stages: --enable_cuda_profiling --enable_cuda_nhwc_ops \ --enable_pybind --build_java \ --use_cache \ - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75; \ + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 \ + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON \ + --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON; \ ccache -sv; \ ccache -z" workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index eee38ac04b..291e2f4e19 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -42,7 +42,12 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_cuda.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + additionalBuildFlags: >- + --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --enable_cuda_profiling + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON + --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo @@ -59,7 +64,10 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_cuda.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + additionalBuildFlags: >- + --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --skip_onnx_tests + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo @@ -95,7 +103,11 @@ stages: EnvSetupScript: setup_env_cuda.bat buildArch: x64 # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags - additionalBuildFlags: --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF + additionalBuildFlags: >- + --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda + --cuda_home="$(Agent.TempDirectory)\v11.8" + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo