onnxruntime/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
Dmitri Smirnov b95fd4e644
Enable CUDA EP unit testing on Windows (#20039)
### Description
Address build issues and source code discrepancies.
Fix cuda_test_provider gtest argument stack corruption.

### Motivation and Context
`OpTester` class that is widely used for kernel testing is not
suitable for testing internal classes for EPs that are built as shared
objects.
Currently, CUDA EP tests run only on Linux.
We want to enable testing and developments on Windows,
and create a usable pattern for testing of other EPs internals.

Alternatives considered: 
Abstracting EP unit tests into separate test executable such as
`onnxruntime_test_all`.
This alternative was rejected as it would create a lot more changes in
the established patterns,
and potentially interfere with CUDA functionality with more complex
source code maintanence.
2024-03-27 13:32:36 -07:00

274 lines
10 KiB
YAML

##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
trigger:
branches:
include:
- main
- rel-*
paths:
exclude:
- docs/**
- README.md
- CONTRIBUTING.md
- BUILD.md
- 'js/web'
- 'onnxruntime/core/providers/js'
pr:
branches:
include:
- main
- rel-*
paths:
exclude:
- docs/**
- README.md
- CONTRIBUTING.md
- BUILD.md
- 'js/web'
- 'onnxruntime/core/providers/js'
#### end trigger ####
parameters:
- name: CudaVersion
displayName: CUDA version
type: string
default: '11.8'
values:
- 11.8
- 12.2
- name: SpecificArtifact
displayName: Use Specific Artifact
type: boolean
default: false
- name: BuildId
displayName: Specific Artifact's BuildId
type: string
default: '0'
resources:
repositories:
- repository: manylinux
type: Github
endpoint: Microsoft
name: pypa/manylinux
ref: 5eda9aded5462201e6310105728d33016e637ea7
variables:
- name: docker_base_image
${{ if eq(parameters.CudaVersion, '11.8') }}:
value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8
${{ if eq(parameters.CudaVersion, '12.2') }}:
value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8
- name: linux_trt_version
${{ if eq(parameters.CudaVersion, '11.8') }}:
value: 8.6.1.6-1.cuda11.8
${{ if eq(parameters.CudaVersion, '12.2') }}:
value: 8.6.1.6-1.cuda12.0
- name: Repository
${{ if eq(parameters.CudaVersion, '11.8') }}:
value: 'onnxruntimecuda11build'
${{ if eq(parameters.CudaVersion, '12.2') }}:
value: 'onnxruntimecuda12build'
stages:
- stage: Linux_Build
jobs:
- job: Linux_Build
timeoutInMinutes: 120
variables:
skipComponentGovernanceDetection: true
CCACHE_DIR: $(Pipeline.Workspace)/ccache
workspace:
clean: all
pool: onnxruntime-Ubuntu2204-AMD-CPU
steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'
condition: always()
- checkout: self
clean: true
submodules: none
- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "
--network=host
--build-arg BASEIMAGE=$(docker_base_image)
--build-arg TRT_VERSION=$(linux_trt_version)
--build-arg BUILD_UID=$( id -u )
"
Repository: $(Repository)
- task: Cache@2
inputs:
key: '"ccache" | "${{parameters.CudaVersion}}" |"$(Build.SourceBranch)" | "$(Build.SourceVersion)"'
path: $(CCACHE_DIR)
restoreKeys: |
"ccache" | "${{parameters.CudaVersion}}" | "$(Build.SourceBranch)"
"ccache"
cacheHitVar: CACHE_RESTORED
displayName: Cach Task
- script: |
sudo mkdir -p $(Pipeline.Workspace)/ccache
condition: ne(variables.CACHE_RESTORED, 'true')
displayName: Create Cache Dir
- script: |
set -e -x
mkdir -p $HOME/.onnx
docker run -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \
--volume /data/onnx:/data/onnx:ro \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume /data/models:/build/models:ro \
--volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
--volume $(Pipeline.Workspace)/ccache:/cache \
-e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
-e CCACHE_DIR=/cache \
$(Repository) \
/bin/bash -c "
set -ex; \
env; \
ccache -s; \
/opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
--build_dir /build --cmake_generator Ninja \
--config Release --update --build \
--skip_submodule_sync \
--build_shared_lib \
--parallel --use_binskim_compliant_compile_flags \
--build_wheel \
--enable_onnx_tests --use_cuda --cuda_version=${{parameters.CudaVersion}} --cuda_home=/usr/local/cuda-${{parameters.CudaVersion}} --cudnn_home=/usr/local/cuda-${{parameters.CudaVersion}} \
--enable_cuda_profiling --enable_cuda_nhwc_ops \
--enable_pybind --build_java \
--use_cache \
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 \
--cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON \
--cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON; \
ccache -sv; \
ccache -z"
workingDirectory: $(Build.SourcesDirectory)
displayName: Build Onnxruntime
- task: CmdLine@2
inputs:
script: |
rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11
rm -f $(Build.BinariesDirectory)/Release/models
find $(Build.BinariesDirectory)/Release/_deps -mindepth 1 ! -regex '^$(Build.BinariesDirectory)/Release/_deps/onnx-src\(/.*\)?' -delete
cd $(Build.BinariesDirectory)/Release
find -executable -type f > $(Build.BinariesDirectory)/Release/perms.txt
- task: PublishPipelineArtifact@0
displayName: 'Publish Pipeline Artifact'
inputs:
artifactName: 'drop-linux'
targetPath: '$(Build.BinariesDirectory)/Release'
- template: templates/explicitly-defined-final-tasks.yml
- stage: Linux_Test
dependsOn:
- Linux_Build
jobs:
- job: Linux_Test
timeoutInMinutes: 180
variables:
skipComponentGovernanceDetection: true
workspace:
clean: all
pool: onnxruntime-Linux-GPU-T4
steps:
- checkout: self
clean: true
submodules: none
- template: templates/flex-downloadPipelineArtifact.yml
parameters:
ArtifactName: 'drop-linux'
StepName: 'Download Pipeline Artifact - Linux Build'
TargetPath: '$(Build.BinariesDirectory)/Release'
SpecificArtifact: ${{ parameters.SpecificArtifact }}
BuildId: ${{ parameters.BuildId }}
- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "
--network=host
--build-arg BASEIMAGE=$(docker_base_image)
--build-arg TRT_VERSION=$(linux_trt_version)
--build-arg BUILD_UID=$( id -u )
"
Repository: $(Repository)
- task: CmdLine@2
inputs:
script: |
set -e -x
mkdir -p $HOME/.onnx
docker run --gpus all --rm \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory)/Release:/build/Release \
--volume /data/models:/build/models:ro \
--volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
--volume /data/onnx:/data/onnx \
-e NVIDIA_TF32_OVERRIDE=0 \
$(Repository) \
/bin/bash -c '
nvidia-smi; \
/sbin/ldconfig -N -v $(sed "s/:/ /" <<< $LD_LIBRARY_PATH) 2>/dev/null | grep -E "libcudart.so|libcudnn.so|libnvinfer.so"; \
cat /usr/local/cuda/include/cuda.h | grep -m1 CUDA_VERSION; \
cat /usr/include/cudnn_version.h | grep CUDNN_MAJOR -m1 -A 2; \
ln -s /opt/python/cp38-cp38/bin/python3 /tmp/python3; \
/tmp/python3 -m pip install /build/Release/dist/*.whl; \
/tmp/python3 -u -c "from onnxruntime.capi._pybind_state import (OrtDevice as C_OrtDevice) ; \
ort_device = C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0); \
print(ort_device); print(ort_device.device_type(), C_OrtDevice.cuda()); \
assert(ort_device.device_type()==1); assert(C_OrtDevice.cuda()==1);" \
'
displayName: 'Check GPU'
- task: CmdLine@2
inputs:
script: |
set -e -x
mkdir -p $HOME/.onnx
docker run --gpus all --shm-size=1g --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --rm \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory)/Release:/build/Release \
--volume /data/models:/build/models:ro \
--volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
--volume /data/onnx:/data/onnx \
-e NVIDIA_TF32_OVERRIDE=0 \
$(Repository) \
/bin/bash -c '
set -ex; \
cp /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt /tmp/requirements.txt; \
ln -s /opt/python/cp38-cp38/bin/python3 /tmp/python3; \
/tmp/python3 -m pip install -r /tmp/requirements.txt; \
/tmp/python3 -m pip install /build/Release/dist/*.whl; \
cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \
cd /onnxruntime_src/java && /onnxruntime_src/java/gradlew cmakeCheck -DcmakeBuildDir=/build/Release -DUSE_CUDA=1; \
cd /tmp; \
/tmp/python3 /onnxruntime_src/tools/ci_build/build.py \
--build_dir /build --config Release --test --skip_submodule_sync --build_shared_lib --parallel --use_binskim_compliant_compile_flags --build_wheel --enable_onnx_tests \
--use_cuda --cuda_version=${{parameters.CudaVersion}} --cuda_home=/usr/local/cuda --cudnn_home=/usr/local/cuda \
--enable_pybind --build_java --ctest_path "" ; \
'
displayName: 'Run Tests'
- template: templates/check_test_result.yml
parameters:
FileName: '$(Build.BinariesDirectory)/Release/onnxruntime_test_all.Release.results.xml'
- template: templates/clean-agent-build-directory-step.yml