From b2ed4fb0a48ef7e0444011b4f97b53cac7ba1f65 Mon Sep 17 00:00:00 2001 From: baijumeswani Date: Thu, 10 Jun 2021 15:58:23 -0700 Subject: [PATCH] Merge orttraining and ortmodule single gpu ci pipelines (#8022) * Merge orttraining and ortmodule single gpu ci pipelines * Remove Debug from orttrainer build config --- .../python/orttraining_test_ortmodule_api.py | 2 +- .../orttraining-linux-gpu-ci-pipeline.yml | 8 ++- ...g-linux-gpu-ortmodule-test-ci-pipeline.yml | 64 ------------------- .../azure-pipelines/templates/linux-ci.yml | 10 +++ ...g-linux-gpu-ortmodule-test-ci-pipeline.yml | 39 +++++++++++ 5 files changed, 56 insertions(+), 67 deletions(-) delete mode 100644 tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml create mode 100644 tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 9a8a387142..28eb17987a 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -2548,7 +2548,7 @@ def test_primitive_inputs(bool_argument, int_argument, float_argument): input1 = torch.randn(N, D_in, device=device) pt_out = pt_model(input1, bool_argument, int_argument, float_argument) ort_out = ort_model(input1, bool_argument, int_argument, float_argument) - assert torch.equal(pt_out, ort_out) + _test_helpers.assert_values_are_close(pt_out, ort_out) @pytest.mark.parametrize("bool_arguments", [(True, False), (False, True)]) def test_changing_bool_input_re_exports_model(bool_arguments): diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml index ae9b2fe2bd..4f8ab9130b 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml @@ -8,6 +8,8 @@ jobs: SubmoduleCheckoutMode: 'recursive' RunDockerBuildArgs: > -o ubuntu20.04 -p 3.8 -d gpu -r $(Build.BinariesDirectory) + -t onnxruntime_orttraining_ortmodule_tests_image + -e -x " --enable_training --config $(buildConfig) @@ -16,6 +18,10 @@ jobs: --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=70 " DoNugetPack: 'false' + RunInjectedPipeline: 'true' + InjectedPipeline: 'orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml' + DockerImageTag: 'onnxruntime_orttraining_ortmodule_tests_image' + BuildConfig: $(buildConfig) ArtifactName: 'drop-linux' TimeoutInMinutes: 120 # Enable unreleased onnx opsets in CI builds @@ -24,8 +30,6 @@ jobs: Strategy: maxParallel: 2 matrix: - Debug: - buildConfig: Debug Release: buildConfig: Release diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml deleted file mode 100644 index 8da4fd73d8..0000000000 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml +++ /dev/null @@ -1,64 +0,0 @@ -trigger: none - -jobs: -- job: Onnxruntime_Linux_GPU_ORTModule_Test - - timeoutInMinutes: 120 - pool: 'Linux-Single-GPU-V100' - - steps: - - checkout: self - clean: true - submodules: recursive - - - template: templates/run-docker-build-steps.yml - parameters: - RunDockerBuildArgs: | - -o ubuntu20.04 -p 3.8 -p 3.8 -d gpu -r $(Build.BinariesDirectory) \ - -t onnxruntime_ortmodule_tests_image \ - -x " \ - --config RelWithDebInfo \ - --enable_training \ - --update --build \ - --build_wheel --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=70 \ - " \ - -u \ - -e - DisplayName: 'Build' - - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist" - displayName: 'Mount MNIST' - condition: succeededOrFailed() - - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/bert-data" -d "/bert_data" - displayName: 'Mount bert-data' - condition: succeededOrFailed() - - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" - displayName: 'Mount hf-models-cache' - condition: succeededOrFailed() - - # Entry point for all ORTModule tests - # The onnxruntime folder is deleted in the build directory - # to enforce use of the onnxruntime wheel - - script: | - docker run \ - --gpus all \ - --shm-size=1024m \ - --rm \ - --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory):/build \ - --volume /mnist:/mnist \ - --volume /bert_data:/bert_data \ - --volume /hf_models_cache:/hf_models_cache \ - onnxruntime_ortmodule_tests_image \ - bash -c "python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl ; rm -rf /build/RelWithDebInfo/onnxruntime/ ; /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers' --cwd /build/RelWithDebInfo" \ - displayName: 'Run orttraining_ortmodule_tests.py' - condition: succeededOrFailed() - timeoutInMinutes: 60 - - - template: templates/component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' - - - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml b/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml index 5ebe5a4895..c0635d0fee 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml @@ -6,6 +6,10 @@ parameters: DoNodejsPack: 'false' DoNugetPack: 'false' NuPackScript: '' + RunInjectedPipeline: 'false' + InjectedPipeline: '' + DockerImageTag: '' + BuildConfig: '' ArtifactName: 'drop-linux' TimeoutInMinutes: 120 # Controls whether unreleased onnx opsets are allowed. Default is set to 1 @@ -64,6 +68,12 @@ jobs: inputs: artifactName: ${{ parameters.ArtifactName }} targetPath: '$(Build.ArtifactStagingDirectory)' + - ${{ if eq(parameters['RunInjectedPipeline'], 'true') }}: + - template: | + ${{ parameters.InjectedPipeline }} + parameters: + DockerImageTag: ${{ parameters.DockerImageTag }} + BuildConfig: ${{ parameters.BuildConfig }} - template: component-governance-component-detection-steps.yml parameters : condition : 'succeeded' diff --git a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml new file mode 100644 index 0000000000..6855f0fffb --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml @@ -0,0 +1,39 @@ +parameters: +- name: DockerImageTag + type: string +- name: BuildConfig + type: string + +steps: + +- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist" + displayName: 'Mount MNIST' + condition: succeededOrFailed() + +- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/bert-data" -d "/bert_data" + displayName: 'Mount bert-data' + condition: succeededOrFailed() + +- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" + displayName: 'Mount hf-models-cache' + condition: succeededOrFailed() + + # Entry point for all ORTModule tests + # The onnxruntime folder is deleted in the build directory + # to enforce use of the onnxruntime wheel + # Uninstall orttraining requirements.txt and install ortmodule requirements.txt before running tests. +- script: | + docker run \ + --gpus all \ + --shm-size=1024m \ + --rm \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \ + --volume /mnist:/mnist \ + --volume /bert_data:/bert_data \ + --volume /hf_models_cache:/hf_models_cache \ + ${{ parameters.DockerImageTag }} \ + bash -c "python3 -m pip uninstall -y -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/requirements.txt && python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cu11.1.txt && python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt && python3 -m pip install /build/dist/onnxruntime*.whl && rm -rf /build/onnxruntime/ && /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers' --cwd /build" \ + displayName: 'Run orttraining_ortmodule_tests.py' + condition: succeededOrFailed() + timeoutInMinutes: 60