Build DML in Windows GPU CI pipeline (#22869)

### Description Add a new stage that builds both CUDA and DML in the Windows GPU CI pipeline (PR checks) to prevent regressions introduced by new CUDA tests. Prefix the test-suite names of all tests under test/providers/cuda/test_cases with CudaEp so they can be skipped easily. ### Motivation and Context 1. CudaNhwcEP is added by default when using the CUDA EP. 2. If onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS is enabled, the tests in test/providers/cuda/test_cases are added too. ### To do Add --enable_pybind to the new stage. Currently, --enable_pybind triggers some Python tests, such as onnxruntime_test_python.py, which uses the get_available_providers() API. More discussion is needed to decide how to make it work.
2026-05-14 20:48:00 +00:00 · 2024-11-25 10:50:52 +08:00 · 2024-11-25 10:50:52 +08:00 · 85751e7276
commit 85751e7276
parent a2ba3cb547
15 changed files with 81 additions and 38 deletions
--- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
+++ b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
@ -27,6 +27,7 @@ import java.util.EnumSet;
 import java.util.HashMap;
 import java.util.Map;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
 import org.junit.jupiter.api.condition.EnabledIfSystemProperty;

 public class ProviderOptionsTest {
@ -34,6 +35,7 @@ public class ProviderOptionsTest {

  @Test
  @EnabledIfSystemProperty(named = "USE_CUDA", matches = "1")
+  @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
  public void testCUDAOptions() throws OrtException {
    // Test standard options
    OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0);
@ -61,6 +63,7 @@ public class ProviderOptionsTest {

  @Test
  @EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1")
+  @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
  public void testTensorRT() throws OrtException {
    // Test standard options
    OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0);
--- a/onnxruntime/test/providers/cuda/cuda_provider_test.cc
+++ b/onnxruntime/test/providers/cuda/cuda_provider_test.cc
@ -11,7 +11,7 @@ ProviderInfo_CUDA& GetProviderInfo_CUDA_Test();

 namespace test {
 namespace cuda {
-TEST(CUDA_EP_Unittest, All) {
+TEST(CudaEpUnittest, All) {
  onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA_Test();
  ep.TestAll();
 }
--- a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
@ -11,7 +11,7 @@
 namespace onnxruntime {
 namespace test {

-TEST(AllocatorTest, CUDAAllocatorTest) {
+TEST(CudaEpAllocatorTest, CUDAAllocatorTest) {
  OrtDevice::DeviceId cuda_device_id = 0;

  // ensure CUDA device is available.
@ -77,7 +77,7 @@ TEST(AllocatorTest, CUDAAllocatorTest) {
 }

 // test that we fallback to smaller allocations if the growth of the arena exceeds the available memory
-TEST(AllocatorTest, CUDAAllocatorFallbackTest) {
+TEST(CudaEpAllocatorTest, CUDAAllocatorFallbackTest) {
  OrtDevice::DeviceId cuda_device_id = 0;

  size_t free = 0;
--- a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc
@ -17,7 +17,7 @@ using onnxruntime::contrib::attention::AttentionBackend;
 namespace onnxruntime {
 namespace test {

-TEST(AttentionKernelOptionsTest, NonZeroValue) {
+TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) {
  {
    AttentionKernelOptions options;
    int value = static_cast<int>(AttentionBackend::FLASH_ATTENTION) | static_cast<int>(AttentionBackend::EFFICIENT_ATTENTION);
@ -156,7 +156,7 @@ TEST(AttentionKernelOptionsTest, NonZeroValue) {
 }

 // Test all environment variables take effect when option value is 0.
-TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
+TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
  constexpr int value = 0;
  ScopedEnvironmentVariables scoped_env_vars{
      EnvVarMap{
@ -186,7 +186,7 @@ TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
 }

 // Test default min sequence lengths when environment variables are not set.
-TEST(AttentionKernelOptionsTest, DefaultMinSeqLens) {
+TEST(CudaEpAttentionKernelOptionsTest, DefaultMinSeqLens) {
  constexpr int value = 0;
  ScopedEnvironmentVariables scoped_env_vars{
      EnvVarMap{
--- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
@ -68,7 +68,7 @@ void ComputeTopKReference(const std::vector<float>& values,
  }
 }

-TEST(TestBeamSearch, TopK) {
+TEST(CudaEpTestBeamSearch, TopK) {
  int32_t batch_size = 4;
  int32_t beam_size = 4;
  int32_t vocab_size = 50257;
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
@ -230,7 +230,7 @@ void testPrepack(int rows, int columns) {
 }

 // TODO: code runs on CPU, but this is for sm80 only, maybe enable only when test on sm80
-TEST(BlkQ4_GEMM, PrepackSm80Test) {
+TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) {
  Status status = onnxruntime::cuda::test::sm80_supported();
  if (!status.IsOK()) {
    // skip the test if sm80 is not supported
@ -263,7 +263,7 @@ TEST(BlkQ4_GEMM, PrepackSm80Test) {
  testPrepack<true, false>(256, 256);
 }

-TEST(BlkQ4_GEMM, Sm80RowBlockingTest) {
+TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) {
  Status status = onnxruntime::cuda::test::sm80_supported();
  if (!status.IsOK()) {
    // skip the test if sm80 is not supported
@ -292,7 +292,7 @@ TEST(BlkQ4_GEMM, Sm80RowBlockingTest) {
  onnxruntime::cuda::test::run_blkq4_gemm<64, false, false, true>(256, 1024, 576);
 }

-TEST(BlkQ4_GEMM, Sm80ColBlockingTest) {
+TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) {
  Status status = onnxruntime::cuda::test::sm80_supported();
  if (!status.IsOK()) {
    // skip the test if sm80 is not supported
@ -305,7 +305,7 @@ TEST(BlkQ4_GEMM, Sm80ColBlockingTest) {
  onnxruntime::cuda::test::run_blkq4_gemm<64, true, false, true>(256, 1024, 576);
 }

-TEST(BlkQ4_GEMM, Sm80SmallMTest) {
+TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) {
  Status status = onnxruntime::cuda::test::sm80_supported();
  if (!status.IsOK()) {
    // skip the test if sm80 is not supported
@ -326,7 +326,7 @@ TEST(BlkQ4_GEMM, Sm80SmallMTest) {
  onnxruntime::cuda::test::run_blkq4_gemm<64, true, true, true>(16, 1024, 576);
 }

-TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) {
+TEST(CudaEpBlkQ4_GEMM, Sm80SmallTileKernelTest) {
  Status status = onnxruntime::cuda::test::sm80_supported();
  if (!status.IsOK()) {
    // skip the test if sm80 is not supported
--- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
@ -19,7 +19,7 @@ namespace cuda {
 namespace test {
 // TODO: Since the "DeferredRelease" has been migrated to CudaStream class,
 // we should migrate this test from CudaEP unit test to CudaStream unit test.
-TEST(TestDeferredRelease, WithArena) {
+TEST(CudaEpTestDeferredRelease, WithArena) {
  // Create CUDA EP.
  CUDAExecutionProviderInfo info;
  CUDAExecutionProvider ep(info);
@ -52,7 +52,7 @@ TEST(TestDeferredRelease, WithArena) {
  ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts));
 }

-TEST(TestDeferredRelease, WithoutArena) {
+TEST(CudaEpTestDeferredRelease, WithoutArena) {
  // Create CUDA EP.
  CUDAExecutionProviderInfo info;
  CUDAExecutionProvider ep(info);
--- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
@ -40,7 +40,7 @@ void TestFillCorrectness(size_t num_elements, TElement value) {
 }
 }  // namespace

-TEST(CudaUtilsTest, FillCorrectness) {
+TEST(CudaEpUnittest, FillCorrectness) {
  TestFillCorrectness<int8_t>(1 << 20, 1);
  TestFillCorrectness<int16_t>(1 << 20, 2);
  TestFillCorrectness<int32_t>(1 << 20, 3);
--- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
@ -10,7 +10,7 @@ namespace onnxruntime {
 namespace cuda {
 namespace test {

-TEST(CudaGemmOptions, TestDefaultOptions) {
+TEST(CudaEpGemmOptions, TestDefaultOptions) {
  HalfGemmOptions gemm_options;
  ASSERT_FALSE(gemm_options.IsCompute16F());
 #if defined(USE_CUDA)
@ -22,7 +22,7 @@ TEST(CudaGemmOptions, TestDefaultOptions) {
 #endif
 }

-TEST(CudaGemmOptions, TestCompute16F) {
+TEST(CudaEpGemmOptions, TestCompute16F) {
  HalfGemmOptions gemm_options;
  gemm_options.Initialize(1);
  ASSERT_TRUE(gemm_options.IsCompute16F());
@ -35,7 +35,7 @@ TEST(CudaGemmOptions, TestCompute16F) {
 #endif
 }

-TEST(CudaGemmOptions, NoReducedPrecision) {
+TEST(CudaEpGemmOptions, NoReducedPrecision) {
  HalfGemmOptions gemm_options;
  gemm_options.Initialize(2);
  ASSERT_FALSE(gemm_options.IsCompute16F());
@ -48,7 +48,7 @@ TEST(CudaGemmOptions, NoReducedPrecision) {
 #endif
 }

-TEST(CudaGemmOptions, Pedantic) {
+TEST(CudaEpGemmOptions, Pedantic) {
  HalfGemmOptions gemm_options;
  gemm_options.Initialize(4);
  ASSERT_FALSE(gemm_options.IsCompute16F());
@ -61,7 +61,7 @@ TEST(CudaGemmOptions, Pedantic) {
 #endif
 }

-TEST(CudaGemmOptions, Compute16F_Pedantic) {
+TEST(CudaEpGemmOptions, Compute16F_Pedantic) {
  HalfGemmOptions gemm_options;
  gemm_options.Initialize(5);
  ASSERT_TRUE(gemm_options.IsCompute16F());
@ -74,7 +74,7 @@ TEST(CudaGemmOptions, Compute16F_Pedantic) {
 #endif
 }

-TEST(CudaGemmOptions, Compute16F_NoReducedPrecision) {
+TEST(CudaEpGemmOptions, Compute16F_NoReducedPrecision) {
  HalfGemmOptions gemm_options;
  gemm_options.Initialize(3);
  ASSERT_TRUE(gemm_options.IsCompute16F());
--- a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc
@ -41,7 +41,7 @@ void ComputeTop1Reference(const std::vector<float>& values,
  }
 }

-TEST(TestGreedySearch, TopOne) {
+TEST(CudaEpTestGreedySearch, TopOne) {
  int32_t batch_size = 4;
  int32_t vocab_size = 50257;
  int32_t batch_x_vocab = batch_size * vocab_size;
--- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
@ -179,7 +179,7 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e
 }
 }  // namespace

-TEST(ReductionFunctionsTest, ReduceRowToScalar) {
+TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) {
  TestReduceRowToScalarApis(3);
  TestReduceRowToScalarApis(19);
  TestReduceRowToScalarApis(123);
@ -188,7 +188,7 @@ TEST(ReductionFunctionsTest, ReduceRowToScalar) {
  TestReduceRowToScalarApis(941736, 2e-4f);
 }

-TEST(ReductionFunctionsTest, ReduceRowsToRow) {
+TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) {
  for (int m : {3, 193, 2945}) {
    for (int n : {3, 193, 2945}) {
      TestReduceRowsToRow(m, n, true);
@ -197,7 +197,7 @@ TEST(ReductionFunctionsTest, ReduceRowsToRow) {
  }
 }

-TEST(ReductionFunctionsTest, ReduceColumnsToColumn) {
+TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) {
  for (int m : {3, 193, 2945}) {
    for (int n : {3, 193, 2945}) {
      TestReduceColumnsToColumn(m, n);
@ -205,7 +205,7 @@ TEST(ReductionFunctionsTest, ReduceColumnsToColumn) {
  }
 }

-TEST(ReductionFunctionsTest, BufferOffsets) {
+TEST(CudaEpReductionFunctionsTest, BufferOffsets) {
  const int m = 2048;
  const int n = 1024;
  const TensorShape shape{m, n};
@ -240,7 +240,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) {
  }
 }

-TEST(ReductionFunctionsTest, InvalidBufferSize) {
+TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) {
  const int m = 2048;
  const int n = 1024;
  const TensorShape shape{m, n};
@ -262,7 +262,7 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) {
  ASSERT_FALSE(status.IsOK());
 }

-TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) {
+TEST(CudaEpReductionFunctionsTest, GetApplicableMatrixReduction) {
  auto test_get_applicable_matrix_reduction =
      [](cudnnReduceTensorOp_t cudnn_op,
         const std::vector<int64_t>& dims, const std::vector<int64_t>& axes,
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml
@ -218,16 +218,32 @@ jobs:
      - powershell: |
         python3 -m pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml -qq
         Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
-
        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
        displayName: 'Install onnxruntime wheel'

  - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}:
-      - powershell: |
-         python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022"  --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
-
-        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
-        displayName: 'Run tests'
+      - ${{ if and(contains(parameters.additionalBuildFlags, 'use_cuda'), contains(parameters.additionalBuildFlags, 'use_dml')) }}:
+        - powershell: |
+           python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
+          workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
+          displayName: 'Run tests excluding CUDA tests'
+          env:
+            NO_CUDA_TEST: '1'
+            GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*' # Exclude CUDA EP tests under providers/cuda/ and cpu models test
+            PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' # For onnxruntime4j_test to find dependent dlls
+        - powershell: |
+            python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
+          workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
+          displayName: 'Run tests excluding DML tests'
+          env:
+            NO_DML_TEST: '1'
+            GTEST_FILTER: '-*cpu_*models*'
+            PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)'
+      - ${{ else }}:
+        - powershell: |
+           python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
+          workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
+          displayName: 'Run tests'

  - ${{ if eq(parameters.GenerateDocumentation, true) }}:
    - task: PythonScript@0
--- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
@ -413,7 +413,7 @@ stages:
              workingDirectory: '$(Build.BinariesDirectory)'
            env:
              NO_CUDA_TEST: '1'
-              GTEST_FILTER: -*CudaNhwcTypedTest*
+              GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*' # Exclude CUDA EP tests under providers/cuda/
          - task: PythonScript@0
            displayName: 'test excludes DML'
            condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
--- a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml
@ -62,4 +62,28 @@ stages:
        RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
        ORT_EP_NAME: CUDA
        WITH_CACHE: true
-        MachinePool: onnxruntime-Win2022-GPU-A10
+        MachinePool: onnxruntime-Win2022-GPU-A10
+
+- stage: cuda_dml
+  dependsOn: []
+  jobs:
+    - template: templates/jobs/win-ci-vs-2022-job.yml
+      parameters:
+        BuildConfig: 'RelWithDebInfo'
+        EnvSetupScript: setup_env_cuda.bat
+        buildArch: x64
+        additionalBuildFlags: >-
+          --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}"
+          --enable_cuda_profiling --enable_transformers_tool_test
+          --use_dml
+          --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
+          --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+        msbuildPlatform: x64
+        isX86: false
+        job_name_suffix: x64_RelWithDebInfo
+        RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
+        ORT_EP_NAME: CUDA
+        EnablePython: false
+        WITH_CACHE: true
+        MachinePool: onnxruntime-Win2022-GPU-A10
--- a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml
@ -43,11 +43,11 @@ stages:
        BuildConfig: 'RelWithDebInfo'
        EnvSetupScript: setup_env.bat
        buildArch: x64
-        additionalBuildFlags: --enable_pybind --use_dml --enable_wcos  --use_winml
+        additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml
        msbuildPlatform: x64
        isX86: false
        job_name_suffix: x64_RelWithDebInfo
        RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
        ORT_EP_NAME: DML
        WITH_CACHE: false
-        MachinePool: onnxruntime-Win2022-GPU-dml-A10
+        MachinePool: onnxruntime-Win2022-GPU-dml-A10