Build DML in Windows GPU CI pipeline (#22869)

### Description
Add a new stage that builds both CUDA and DML in the Windows GPU CI pipeline (PR
checks) to prevent regressions introduced by new CUDA tests.
Rename all tests in cuda/testcases so that their names start with the CudaEp prefix,
which makes them easy to skip with a single test filter.

### Motivation and Context
1. CudaNhwcEP is added by default when the CUDA EP is used.
2. If onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS is enabled, the tests in
tests/provider/cuda/testcases are added too.

### To do
Add --enable_pybind to the new stage.
Currently, --enable_pybind triggers some Python tests, such as
onnxruntime_test_python.py.
Those tests use the get_available_providers() API.
More discussion is needed to decide how to make this work.
This commit is contained in:
Yi Zhang 2024-11-25 10:50:52 +08:00 committed by GitHub
parent a2ba3cb547
commit 85751e7276
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 81 additions and 38 deletions

View file

@ -27,6 +27,7 @@ import java.util.EnumSet;
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
import org.junit.jupiter.api.condition.EnabledIfSystemProperty;
public class ProviderOptionsTest {
@ -34,6 +35,7 @@ public class ProviderOptionsTest {
@Test
@EnabledIfSystemProperty(named = "USE_CUDA", matches = "1")
@DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
public void testCUDAOptions() throws OrtException {
// Test standard options
OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0);
@ -61,6 +63,7 @@ public class ProviderOptionsTest {
@Test
@EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1")
@DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
public void testTensorRT() throws OrtException {
// Test standard options
OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0);

View file

@ -11,7 +11,7 @@ ProviderInfo_CUDA& GetProviderInfo_CUDA_Test();
namespace test {
namespace cuda {
TEST(CUDA_EP_Unittest, All) {
TEST(CudaEpUnittest, All) {
onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA_Test();
ep.TestAll();
}

View file

@ -11,7 +11,7 @@
namespace onnxruntime {
namespace test {
TEST(AllocatorTest, CUDAAllocatorTest) {
TEST(CudaEpAllocatorTest, CUDAAllocatorTest) {
OrtDevice::DeviceId cuda_device_id = 0;
// ensure CUDA device is available.
@ -77,7 +77,7 @@ TEST(AllocatorTest, CUDAAllocatorTest) {
}
// test that we fallback to smaller allocations if the growth of the arena exceeds the available memory
TEST(AllocatorTest, CUDAAllocatorFallbackTest) {
TEST(CudaEpAllocatorTest, CUDAAllocatorFallbackTest) {
OrtDevice::DeviceId cuda_device_id = 0;
size_t free = 0;

View file

@ -17,7 +17,7 @@ using onnxruntime::contrib::attention::AttentionBackend;
namespace onnxruntime {
namespace test {
TEST(AttentionKernelOptionsTest, NonZeroValue) {
TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) {
{
AttentionKernelOptions options;
int value = static_cast<int>(AttentionBackend::FLASH_ATTENTION) | static_cast<int>(AttentionBackend::EFFICIENT_ATTENTION);
@ -156,7 +156,7 @@ TEST(AttentionKernelOptionsTest, NonZeroValue) {
}
// Test all environment variables take effect when option value is 0.
TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
constexpr int value = 0;
ScopedEnvironmentVariables scoped_env_vars{
EnvVarMap{
@ -186,7 +186,7 @@ TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
}
// Test default min sequence lengths when environment variables are not set.
TEST(AttentionKernelOptionsTest, DefaultMinSeqLens) {
TEST(CudaEpAttentionKernelOptionsTest, DefaultMinSeqLens) {
constexpr int value = 0;
ScopedEnvironmentVariables scoped_env_vars{
EnvVarMap{

View file

@ -68,7 +68,7 @@ void ComputeTopKReference(const std::vector<float>& values,
}
}
TEST(TestBeamSearch, TopK) {
TEST(CudaEpTestBeamSearch, TopK) {
int32_t batch_size = 4;
int32_t beam_size = 4;
int32_t vocab_size = 50257;

View file

@ -230,7 +230,7 @@ void testPrepack(int rows, int columns) {
}
// TODO: code runs on CPU, but this is for sm80 only, maybe enable only when test on sm80
TEST(BlkQ4_GEMM, PrepackSm80Test) {
TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) {
Status status = onnxruntime::cuda::test::sm80_supported();
if (!status.IsOK()) {
// skip the test if sm80 is not supported
@ -263,7 +263,7 @@ TEST(BlkQ4_GEMM, PrepackSm80Test) {
testPrepack<true, false>(256, 256);
}
TEST(BlkQ4_GEMM, Sm80RowBlockingTest) {
TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) {
Status status = onnxruntime::cuda::test::sm80_supported();
if (!status.IsOK()) {
// skip the test if sm80 is not supported
@ -292,7 +292,7 @@ TEST(BlkQ4_GEMM, Sm80RowBlockingTest) {
onnxruntime::cuda::test::run_blkq4_gemm<64, false, false, true>(256, 1024, 576);
}
TEST(BlkQ4_GEMM, Sm80ColBlockingTest) {
TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) {
Status status = onnxruntime::cuda::test::sm80_supported();
if (!status.IsOK()) {
// skip the test if sm80 is not supported
@ -305,7 +305,7 @@ TEST(BlkQ4_GEMM, Sm80ColBlockingTest) {
onnxruntime::cuda::test::run_blkq4_gemm<64, true, false, true>(256, 1024, 576);
}
TEST(BlkQ4_GEMM, Sm80SmallMTest) {
TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) {
Status status = onnxruntime::cuda::test::sm80_supported();
if (!status.IsOK()) {
// skip the test if sm80 is not supported
@ -326,7 +326,7 @@ TEST(BlkQ4_GEMM, Sm80SmallMTest) {
onnxruntime::cuda::test::run_blkq4_gemm<64, true, true, true>(16, 1024, 576);
}
TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) {
TEST(CudaEpBlkQ4_GEMM, Sm80SmallTileKernelTest) {
Status status = onnxruntime::cuda::test::sm80_supported();
if (!status.IsOK()) {
// skip the test if sm80 is not supported

View file

@ -19,7 +19,7 @@ namespace cuda {
namespace test {
// TODO: Since the "DeferredRelease" has been migrated to CudaStream class,
// we should migrate this test from CudaEP unit test to CudaStream unit test.
TEST(TestDeferredRelease, WithArena) {
TEST(CudaEpTestDeferredRelease, WithArena) {
// Create CUDA EP.
CUDAExecutionProviderInfo info;
CUDAExecutionProvider ep(info);
@ -52,7 +52,7 @@ TEST(TestDeferredRelease, WithArena) {
ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts));
}
TEST(TestDeferredRelease, WithoutArena) {
TEST(CudaEpTestDeferredRelease, WithoutArena) {
// Create CUDA EP.
CUDAExecutionProviderInfo info;
CUDAExecutionProvider ep(info);

View file

@ -40,7 +40,7 @@ void TestFillCorrectness(size_t num_elements, TElement value) {
}
} // namespace
TEST(CudaUtilsTest, FillCorrectness) {
TEST(CudaEpUnittest, FillCorrectness) {
TestFillCorrectness<int8_t>(1 << 20, 1);
TestFillCorrectness<int16_t>(1 << 20, 2);
TestFillCorrectness<int32_t>(1 << 20, 3);

View file

@ -10,7 +10,7 @@ namespace onnxruntime {
namespace cuda {
namespace test {
TEST(CudaGemmOptions, TestDefaultOptions) {
TEST(CudaEpGemmOptions, TestDefaultOptions) {
HalfGemmOptions gemm_options;
ASSERT_FALSE(gemm_options.IsCompute16F());
#if defined(USE_CUDA)
@ -22,7 +22,7 @@ TEST(CudaGemmOptions, TestDefaultOptions) {
#endif
}
TEST(CudaGemmOptions, TestCompute16F) {
TEST(CudaEpGemmOptions, TestCompute16F) {
HalfGemmOptions gemm_options;
gemm_options.Initialize(1);
ASSERT_TRUE(gemm_options.IsCompute16F());
@ -35,7 +35,7 @@ TEST(CudaGemmOptions, TestCompute16F) {
#endif
}
TEST(CudaGemmOptions, NoReducedPrecision) {
TEST(CudaEpGemmOptions, NoReducedPrecision) {
HalfGemmOptions gemm_options;
gemm_options.Initialize(2);
ASSERT_FALSE(gemm_options.IsCompute16F());
@ -48,7 +48,7 @@ TEST(CudaGemmOptions, NoReducedPrecision) {
#endif
}
TEST(CudaGemmOptions, Pedantic) {
TEST(CudaEpGemmOptions, Pedantic) {
HalfGemmOptions gemm_options;
gemm_options.Initialize(4);
ASSERT_FALSE(gemm_options.IsCompute16F());
@ -61,7 +61,7 @@ TEST(CudaGemmOptions, Pedantic) {
#endif
}
TEST(CudaGemmOptions, Compute16F_Pedantic) {
TEST(CudaEpGemmOptions, Compute16F_Pedantic) {
HalfGemmOptions gemm_options;
gemm_options.Initialize(5);
ASSERT_TRUE(gemm_options.IsCompute16F());
@ -74,7 +74,7 @@ TEST(CudaGemmOptions, Compute16F_Pedantic) {
#endif
}
TEST(CudaGemmOptions, Compute16F_NoReducedPrecision) {
TEST(CudaEpGemmOptions, Compute16F_NoReducedPrecision) {
HalfGemmOptions gemm_options;
gemm_options.Initialize(3);
ASSERT_TRUE(gemm_options.IsCompute16F());

View file

@ -41,7 +41,7 @@ void ComputeTop1Reference(const std::vector<float>& values,
}
}
TEST(TestGreedySearch, TopOne) {
TEST(CudaEpTestGreedySearch, TopOne) {
int32_t batch_size = 4;
int32_t vocab_size = 50257;
int32_t batch_x_vocab = batch_size * vocab_size;

View file

@ -179,7 +179,7 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e
}
} // namespace
TEST(ReductionFunctionsTest, ReduceRowToScalar) {
TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) {
TestReduceRowToScalarApis(3);
TestReduceRowToScalarApis(19);
TestReduceRowToScalarApis(123);
@ -188,7 +188,7 @@ TEST(ReductionFunctionsTest, ReduceRowToScalar) {
TestReduceRowToScalarApis(941736, 2e-4f);
}
TEST(ReductionFunctionsTest, ReduceRowsToRow) {
TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) {
for (int m : {3, 193, 2945}) {
for (int n : {3, 193, 2945}) {
TestReduceRowsToRow(m, n, true);
@ -197,7 +197,7 @@ TEST(ReductionFunctionsTest, ReduceRowsToRow) {
}
}
TEST(ReductionFunctionsTest, ReduceColumnsToColumn) {
TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) {
for (int m : {3, 193, 2945}) {
for (int n : {3, 193, 2945}) {
TestReduceColumnsToColumn(m, n);
@ -205,7 +205,7 @@ TEST(ReductionFunctionsTest, ReduceColumnsToColumn) {
}
}
TEST(ReductionFunctionsTest, BufferOffsets) {
TEST(CudaEpReductionFunctionsTest, BufferOffsets) {
const int m = 2048;
const int n = 1024;
const TensorShape shape{m, n};
@ -240,7 +240,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) {
}
}
TEST(ReductionFunctionsTest, InvalidBufferSize) {
TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) {
const int m = 2048;
const int n = 1024;
const TensorShape shape{m, n};
@ -262,7 +262,7 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) {
ASSERT_FALSE(status.IsOK());
}
TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) {
TEST(CudaEpReductionFunctionsTest, GetApplicableMatrixReduction) {
auto test_get_applicable_matrix_reduction =
[](cudnnReduceTensorOp_t cudnn_op,
const std::vector<int64_t>& dims, const std::vector<int64_t>& axes,

View file

@ -218,16 +218,32 @@ jobs:
- powershell: |
python3 -m pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml -qq
Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
displayName: 'Install onnxruntime wheel'
- ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}:
- powershell: |
python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
displayName: 'Run tests'
- ${{ if and(contains(parameters.additionalBuildFlags, 'use_cuda'), contains(parameters.additionalBuildFlags, 'use_dml')) }}:
- powershell: |
python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
displayName: 'Run tests excluding CUDA tests'
env:
NO_CUDA_TEST: '1'
GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*' # Exclude CUDA EP tests under providers/cuda/ and cpu models test
PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' # For onnxruntime4j_test to find dependent dlls
- powershell: |
python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
displayName: 'Run tests excluding DML tests'
env:
NO_DML_TEST: '1'
GTEST_FILTER: '-*cpu_*models*'
PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)'
- ${{ else }}:
- powershell: |
python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
displayName: 'Run tests'
- ${{ if eq(parameters.GenerateDocumentation, true) }}:
- task: PythonScript@0

View file

@ -413,7 +413,7 @@ stages:
workingDirectory: '$(Build.BinariesDirectory)'
env:
NO_CUDA_TEST: '1'
GTEST_FILTER: -*CudaNhwcTypedTest*
GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*' # Exclude CUDA EP tests under providers/cuda/
- task: PythonScript@0
displayName: 'test excludes DML'
condition: and(succeeded(), eq('${{ parameters.runTests}}', true))

View file

@ -62,4 +62,28 @@ stages:
RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
ORT_EP_NAME: CUDA
WITH_CACHE: true
MachinePool: onnxruntime-Win2022-GPU-A10
MachinePool: onnxruntime-Win2022-GPU-A10
- stage: cuda_dml
dependsOn: []
jobs:
- template: templates/jobs/win-ci-vs-2022-job.yml
parameters:
BuildConfig: 'RelWithDebInfo'
EnvSetupScript: setup_env_cuda.bat
buildArch: x64
additionalBuildFlags: >-
--build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}"
--enable_cuda_profiling --enable_transformers_tool_test
--use_dml
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
--cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
--cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
msbuildPlatform: x64
isX86: false
job_name_suffix: x64_RelWithDebInfo
RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
ORT_EP_NAME: CUDA
EnablePython: false
WITH_CACHE: true
MachinePool: onnxruntime-Win2022-GPU-A10

View file

@ -43,11 +43,11 @@ stages:
BuildConfig: 'RelWithDebInfo'
EnvSetupScript: setup_env.bat
buildArch: x64
additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml
additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml
msbuildPlatform: x64
isX86: false
job_name_suffix: x64_RelWithDebInfo
RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
ORT_EP_NAME: DML
WITH_CACHE: false
MachinePool: onnxruntime-Win2022-GPU-dml-A10
MachinePool: onnxruntime-Win2022-GPU-dml-A10