Merge orttraining and ortmodule single gpu ci pipelines (#8022)

* Merge orttraining and ortmodule single gpu ci pipelines

* Remove Debug from orttrainer build config
This commit is contained in:
baijumeswani 2021-06-10 15:58:23 -07:00 committed by GitHub
parent 4d1b48632c
commit b2ed4fb0a4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 56 additions and 67 deletions

View file

@ -2548,7 +2548,7 @@ def test_primitive_inputs(bool_argument, int_argument, float_argument):
input1 = torch.randn(N, D_in, device=device)
pt_out = pt_model(input1, bool_argument, int_argument, float_argument)
ort_out = ort_model(input1, bool_argument, int_argument, float_argument)
assert torch.equal(pt_out, ort_out)
_test_helpers.assert_values_are_close(pt_out, ort_out)
@pytest.mark.parametrize("bool_arguments", [(True, False), (False, True)])
def test_changing_bool_input_re_exports_model(bool_arguments):

View file

@ -8,6 +8,8 @@ jobs:
SubmoduleCheckoutMode: 'recursive'
RunDockerBuildArgs: >
-o ubuntu20.04 -p 3.8 -d gpu -r $(Build.BinariesDirectory)
-t onnxruntime_orttraining_ortmodule_tests_image
-e
-x "
--enable_training
--config $(buildConfig)
@ -16,6 +18,10 @@ jobs:
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=70
"
DoNugetPack: 'false'
RunInjectedPipeline: 'true'
InjectedPipeline: 'orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml'
DockerImageTag: 'onnxruntime_orttraining_ortmodule_tests_image'
BuildConfig: $(buildConfig)
ArtifactName: 'drop-linux'
TimeoutInMinutes: 120
# Enable unreleased onnx opsets in CI builds
@ -24,8 +30,6 @@ jobs:
Strategy:
maxParallel: 2
matrix:
Debug:
buildConfig: Debug
Release:
buildConfig: Release

View file

@ -1,64 +0,0 @@
# Standalone single-GPU pipeline: builds onnxruntime with training enabled
# inside a Docker image, mounts the test-data shares, then runs the ORTModule
# test suite in that image.
trigger: none
jobs:
- job: Onnxruntime_Linux_GPU_ORTModule_Test
  timeoutInMinutes: 120
  pool: 'Linux-Single-GPU-V100'
  steps:
  - checkout: self
    clean: true
    submodules: recursive
  # Build step. The image tag given with -t must match the image name used by
  # the `docker run` step further down.
  - template: templates/run-docker-build-steps.yml
    parameters:
      RunDockerBuildArgs: |
        -o ubuntu20.04 -p 3.8 -d gpu -r $(Build.BinariesDirectory) \
        -t onnxruntime_ortmodule_tests_image \
        -x " \
          --config RelWithDebInfo \
          --enable_training \
          --update --build \
          --build_wheel --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=70 \
        " \
        -u \
        -e
      DisplayName: 'Build'
  # The three mount steps use succeededOrFailed() so they still run when an
  # earlier step has failed.
  - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist"
    displayName: 'Mount MNIST'
    condition: succeededOrFailed()
  - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/bert-data" -d "/bert_data"
    displayName: 'Mount bert-data'
    condition: succeededOrFailed()
  - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/hf-models-cache" -d "/hf_models_cache"
    displayName: 'Mount hf-models-cache'
    condition: succeededOrFailed()
  # Entry point for all ORTModule tests.
  # The onnxruntime folder is deleted in the build directory
  # to enforce use of the onnxruntime wheel.
  # The in-container commands are chained with && so that a failed wheel
  # install stops the run instead of launching tests without the wheel.
  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /mnist:/mnist \
        --volume /bert_data:/bert_data \
        --volume /hf_models_cache:/hf_models_cache \
        onnxruntime_ortmodule_tests_image \
        bash -c "python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && \
          rm -rf /build/RelWithDebInfo/onnxruntime/ && \
          /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers' --cwd /build/RelWithDebInfo"
    displayName: 'Run orttraining_ortmodule_tests.py'
    condition: succeededOrFailed()
    timeoutInMinutes: 60
  - template: templates/component-governance-component-detection-steps.yml
    parameters:
      condition: 'succeeded'
  - template: templates/clean-agent-build-directory-step.yml

View file

@ -6,6 +6,10 @@ parameters:
DoNodejsPack: 'false'
DoNugetPack: 'false'
NuPackScript: ''
RunInjectedPipeline: 'false'
InjectedPipeline: ''
DockerImageTag: ''
BuildConfig: ''
ArtifactName: 'drop-linux'
TimeoutInMinutes: 120
# Controls whether unreleased onnx opsets are allowed. Default is set to 1
@ -64,6 +68,12 @@ jobs:
inputs:
artifactName: ${{ parameters.ArtifactName }}
targetPath: '$(Build.ArtifactStagingDirectory)'
- ${{ if eq(parameters['RunInjectedPipeline'], 'true') }}:
- template: |
${{ parameters.InjectedPipeline }}
parameters:
DockerImageTag: ${{ parameters.DockerImageTag }}
BuildConfig: ${{ parameters.BuildConfig }}
- template: component-governance-component-detection-steps.yml
parameters :
condition : 'succeeded'

View file

@ -0,0 +1,39 @@
# Step template: mounts the training test-data shares and runs the ORTModule
# test suite inside an already-built Docker image.
parameters:
# Name/tag of the Docker image the tests are run in.
- name: DockerImageTag
  type: string
# Build configuration name; selects the subdirectory of
# $(Build.BinariesDirectory) that is mounted into the container as /build.
- name: BuildConfig
  type: string
steps:
# The three mount steps use succeededOrFailed() so they still run when an
# earlier step has failed.
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist"
  displayName: 'Mount MNIST'
  condition: succeededOrFailed()
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/bert-data" -d "/bert_data"
  displayName: 'Mount bert-data'
  condition: succeededOrFailed()
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/hf-models-cache" -d "/hf_models_cache"
  displayName: 'Mount hf-models-cache'
  condition: succeededOrFailed()
# Entry point for all ORTModule tests.
# The onnxruntime folder is deleted in the build directory
# to enforce use of the onnxruntime wheel.
# The orttraining requirements.txt packages are uninstalled and the ortmodule
# stage1/stage2 requirements are installed before the tests run.
# Each stage is chained with &&, so the first failure aborts the run. The
# backslash-newlines inside the double-quoted string are removed by bash, so
# the inner shell still receives a single command line.
- script: |
    docker run \
      --gpus all \
      --shm-size=1024m \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \
      --volume /mnist:/mnist \
      --volume /bert_data:/bert_data \
      --volume /hf_models_cache:/hf_models_cache \
      ${{ parameters.DockerImageTag }} \
      bash -c "python3 -m pip uninstall -y -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/requirements.txt && \
        python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cu11.1.txt && \
        python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt && \
        python3 -m pip install /build/dist/onnxruntime*.whl && \
        rm -rf /build/onnxruntime/ && \
        /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers' --cwd /build"
  displayName: 'Run orttraining_ortmodule_tests.py'
  condition: succeededOrFailed()
  timeoutInMinutes: 60