Liqun/move nightly pl to linux multi gpu v100 (#6024)

* Move the E2E nightly pipeline to Azure DevOps
Co-authored-by: liqun <liqun@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
This commit is contained in:
liqunfu 2020-12-14 12:43:41 -08:00 committed by GitHub
parent dd2e5a1a05
commit cde723a136
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 49 additions and 17 deletions

View file

@ -93,6 +93,7 @@ def create_pretraining_dataset(input_file, max_pred_length, args):
class pretraining_dataset(Dataset):
def __init__(self, input_file, max_pred_length):
logger.info("pretraining_dataset: %s, max_pred_length: %d", input_file, max_pred_length)
self.input_file = input_file
self.max_pred_length = max_pred_length
f = h5py.File(input_file, "r")

View file

@ -48,6 +48,8 @@ def run_with_config(config):
cmds.append("--attn_dropout_recompute")
if config.transformer_layer_recompute:
cmds.append("--transformer_layer_recompute")
# Access to the Azure storage shared disk is much slower, so a longer timeout is needed.
subprocess.run(cmds, timeout=1200).check_returncode()
for config in configs:

View file

@ -1,29 +1,16 @@
trigger: none
jobs:
- job: Onnxruntime_Linux_GPU_Training_E2E_Test_Nightly
- job: Orttraining_Linux_GPU_Training_E2E_Test_Nightly
timeoutInMinutes: 120
pool: 'Linux-Multi-GPU-V100-E2E3'
steps:
- checkout: self
clean: true
submodules: recursive
# update these if the E2E test data changes
- script: |
orttraining/tools/ci_test/download_azure_blob_archive.py \
--azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \
--target_dir $(Build.BinariesDirectory)/training_e2e_test_data \
--archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
displayName: 'Download onnxruntime_training_data.zip data'
- script: |
orttraining/tools/ci_test/download_azure_blob_archive.py \
--azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/glue_MRPC_data.zip \
--target_dir /bert_data/hf_data/
displayName: 'Download glue_MRPC_data.zip data'
- template: templates/run-docker-build-steps.yml
parameters:
RunDockerBuildArgs: |
@ -40,6 +27,18 @@ jobs:
-m
DisplayName: 'Build'
# update these if the E2E test data changes
- script: |
orttraining/tools/ci_test/download_azure_blob_archive.py \
--azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \
--target_dir $(Build.BinariesDirectory)/training_e2e_test_data \
--archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
displayName: 'Download onnxruntime_training_data.zip data'
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh $(orttrainingtestdata-storage-key)
displayName: 'Map test data'
condition: succeededOrFailed() # ensure all tests are run
# Hit OOM with run_training_pipeline_e2e_tests.py - slightly above 16GB limit.
# leave this code here for further investigation.
# https://msdata.visualstudio.com/Vienna/_workitems/edit/956642
@ -204,7 +203,7 @@ jobs:
--env CUDA_VISIBLE_DEVICES 2
displayName: 'Run orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag'
condition: succeededOrFailed() # ensure all tests are run
timeoutInMinutes: 10
timeoutInMinutes: 30
- script: |
docker run \
@ -252,5 +251,5 @@ jobs:
displayName: 'Run orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence'
condition: succeededOrFailed() # ensure all tests are run
timeoutInMinutes: 20
- template: templates/clean-agent-build-directory-step.yml

View file

@ -0,0 +1,30 @@
#!/usr/bin/env bash
#
# Mounts the "bert-data" Azure file share of the orttrainingtestdata storage
# account at /bert_data over CIFS, so that the training E2E tests can read
# their data from it.
#
# Usage:
#   azure_scale_set_vm_mount_test_data.sh <storage-account-key>
#
# Steps performed (all privileged operations go through sudo):
#   1. Unmount /bert_data if something is mounted there, then recreate it empty.
#   2. Rewrite /etc/smbcredentials/orttrainingtestdata.cred with the user name
#      and the key given as $1.
#   3. Make sure /etc/fstab contains exactly one entry for the share.
#   4. Mount the share.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

if [ "$#" -lt 1 ] || [ -z "$1" ]; then
  echo "usage: $0 <storage-account-key>" >&2
  exit 1
fi

STORAGE_ACCOUNT="orttrainingtestdata"
SHARE="//${STORAGE_ACCOUNT}.file.core.windows.net/bert-data"
MOUNT_POINT="/bert_data"
CRED_DIR="/etc/smbcredentials"
CRED_FILE="${CRED_DIR}/${STORAGE_ACCOUNT}.cred"
MOUNT_OPTS="vers=3.0,credentials=${CRED_FILE},dir_mode=0777,file_mode=0777,serverino"
FSTAB_ENTRY="${SHARE} ${MOUNT_POINT} cifs nofail,${MOUNT_OPTS}"

# Only unmount when something is actually mounted. /proc/mounts is consulted
# instead of stat-ing the directory so that a stale CIFS mount is still seen.
# If the unmount fails, `set -e` stops the script here, BEFORE the rm -rf
# below could descend into the still-mounted share and delete its data.
if grep -qs " ${MOUNT_POINT} " /proc/mounts; then
  sudo umount "${MOUNT_POINT}"
fi

# Start from an empty local directory. --one-file-system is a second safety
# net: rm refuses to cross into any file system mounted underneath.
if [ -d "${MOUNT_POINT}" ]; then
  sudo rm -rf --one-file-system "${MOUNT_POINT}"
fi
sudo mkdir -p "${MOUNT_POINT}"

sudo mkdir -p "${CRED_DIR}"

# Create the credentials file as root with mode 600 *before* the secret is
# written, so it is never readable by other users. An existing file is
# replaced.
sudo install -m 600 /dev/null "${CRED_FILE}"

# `sudo echo ... >> file` would perform the redirection as the calling user,
# not as root; piping into `sudo tee` lets root do the write. printf is a
# shell builtin, so the key is not exposed as an argument of a new process.
printf 'username=%s\npassword=%s\n' "${STORAGE_ACCOUNT}" "$1" \
  | sudo tee "${CRED_FILE}" > /dev/null

# Register the share in /etc/fstab only once; this script runs on every
# pipeline execution and would otherwise append a duplicate line each time.
if ! grep -qsxF -- "${FSTAB_ENTRY}" /etc/fstab; then
  echo "${FSTAB_ENTRY}" | sudo tee -a /etc/fstab > /dev/null
fi

sudo mount -t cifs "${SHARE}" "${MOUNT_POINT}" -o "${MOUNT_OPTS}"