From cde723a1361ab2c66879cebb3515d5e5eef763fa Mon Sep 17 00:00:00 2001 From: liqunfu Date: Mon, 14 Dec 2020 12:43:41 -0800 Subject: [PATCH] Liqun/move nightly pl to linux multi gpu v100 (#6024) * move e2e nightly pipeline to azure devop Co-authored-by: liqun --- .../python/orttraining_run_bert_pretrain.py | 1 + ...rttraining_run_frontend_batch_size_test.py | 2 ++ ...ng-linux-gpu-e2e-test-nightly-pipeline.yml | 33 +++++++++---------- .../azure_scale_set_vm_mount_test_data.sh | 30 +++++++++++++++++ 4 files changed, 49 insertions(+), 17 deletions(-) create mode 100755 tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh diff --git a/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py b/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py index 5a88bc497e..d3b725fe0f 100644 --- a/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py +++ b/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py @@ -93,6 +93,7 @@ def create_pretraining_dataset(input_file, max_pred_length, args): class pretraining_dataset(Dataset): def __init__(self, input_file, max_pred_length): + logger.info("pretraining_dataset: %s, max_pred_length: %d", input_file, max_pred_length) self.input_file = input_file self.max_pred_length = max_pred_length f = h5py.File(input_file, "r") diff --git a/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py b/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py index 3c8dfea216..b49fc3c95c 100644 --- a/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py +++ b/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py @@ -48,6 +48,8 @@ def run_with_config(config): cmds.append("--attn_dropout_recompute") if config.transformer_layer_recompute: cmds.append("--transformer_layer_recompute") + + # access to azure storage shared disk is much slower so we need a longer 
timeout. subprocess.run(cmds, timeout=1200).check_returncode() for config in configs: diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml index ec8d3695d5..4699b6f4f2 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml @@ -1,29 +1,16 @@ trigger: none jobs: -- job: Onnxruntime_Linux_GPU_Training_E2E_Test_Nightly +- job: Orttraining_Linux_GPU_Training_E2E_Test_Nightly timeoutInMinutes: 120 + pool: 'Linux-Multi-GPU-V100-E2E3' steps: - checkout: self clean: true submodules: recursive - # update these if the E2E test data changes - - script: | - orttraining/tools/ci_test/download_azure_blob_archive.py \ - --azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \ - --target_dir $(Build.BinariesDirectory)/training_e2e_test_data \ - --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9 - displayName: 'Download onnxruntime_training_data.zip data' - - - script: | - orttraining/tools/ci_test/download_azure_blob_archive.py \ - --azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/glue_MRPC_data.zip \ - --target_dir /bert_data/hf_data/ - displayName: 'Download glue_MRPC_data.zip data' - - template: templates/run-docker-build-steps.yml parameters: RunDockerBuildArgs: | @@ -40,6 +27,18 @@ jobs: -m DisplayName: 'Build' + # update these if the E2E test data changes + - script: | + orttraining/tools/ci_test/download_azure_blob_archive.py \ + --azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \ + --target_dir $(Build.BinariesDirectory)/training_e2e_test_data \ + 
--archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9 + displayName: 'Download onnxruntime_training_data.zip data' + + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh $(orttrainingtestdata-storage-key) + displayName: 'Map test data' + condition: succeededOrFailed() # ensure all tests are run + # Hit OOM with run_training_pipeline_e2e_tests.py - slightly above 16GB limit. # leave this code here for further investigation. # https://msdata.visualstudio.com/Vienna/_workitems/edit/956642 @@ -204,7 +203,7 @@ jobs: --env CUDA_VISIBLE_DEVICES 2 displayName: 'Run orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag' condition: succeededOrFailed() # ensure all tests are run - timeoutInMinutes: 10 + timeoutInMinutes: 30 - script: | docker run \ @@ -252,5 +251,5 @@ jobs: displayName: 'Run orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence' condition: succeededOrFailed() # ensure all tests are run timeoutInMinutes: 20 - + - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh b/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh new file mode 100755 index 0000000000..ec30518c6f --- /dev/null +++ b/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh @@ -0,0 +1,30 @@ +#! /usr/bin/env bash +if [ -d "/bert_data" ]; then +sudo umount /bert_data +fi +if [ -d "/bert_data" ]; then +sudo rm -rf /bert_data +fi + +sudo mkdir /bert_data + +if [ ! -d "/etc/smbcredentials" ]; then +sudo mkdir /etc/smbcredentials +fi + +if [ -f "/etc/smbcredentials/orttrainingtestdata.cred" ]; then +sudo rm /etc/smbcredentials/orttrainingtestdata.cred +fi + +# to create orttrainingtestdata.cred, I have to do: 'sudo bash -c ...' 
+sudo bash -c 'echo "username=orttrainingtestdata" >> /etc/smbcredentials/orttrainingtestdata.cred' + +# $1 is empty inside 'sudo bash -c ...' (the single-quoted command runs in a new shell with no positional arguments), +# so the password is written with 'sudo echo ... >>'; that redirect runs as the current user, hence 'sudo chmod 777 ...' first. +sudo chmod 777 /etc/smbcredentials/orttrainingtestdata.cred +sudo echo "password=$1" >> /etc/smbcredentials/orttrainingtestdata.cred + +sudo chmod 600 /etc/smbcredentials/orttrainingtestdata.cred + +sudo bash -c 'echo "//orttrainingtestdata.file.core.windows.net/bert-data /bert_data cifs nofail,vers=3.0,credentials=/etc/smbcredentials/orttrainingtestdata.cred,dir_mode=0777,file_mode=0777,serverino" >> /etc/fstab' +sudo mount -t cifs //orttrainingtestdata.file.core.windows.net/bert-data /bert_data -o vers=3.0,credentials=/etc/smbcredentials/orttrainingtestdata.cred,dir_mode=0777,file_mode=0777,serverino