mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-01 23:30:35 +00:00
Liqun/move nightly pl to linux multi gpu v100 (#6024)
* Move the E2E nightly pipeline to Azure DevOps. Co-authored-by: liqun <liqun@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
This commit is contained in:
parent
dd2e5a1a05
commit
cde723a136
4 changed files with 49 additions and 17 deletions
|
|
@ -93,6 +93,7 @@ def create_pretraining_dataset(input_file, max_pred_length, args):
|
|||
class pretraining_dataset(Dataset):
|
||||
|
||||
def __init__(self, input_file, max_pred_length):
|
||||
logger.info("pretraining_dataset: %s, max_pred_length: %d", input_file, max_pred_length)
|
||||
self.input_file = input_file
|
||||
self.max_pred_length = max_pred_length
|
||||
f = h5py.File(input_file, "r")
|
||||
|
|
|
|||
|
|
@ -48,6 +48,8 @@ def run_with_config(config):
|
|||
cmds.append("--attn_dropout_recompute")
|
||||
if config.transformer_layer_recompute:
|
||||
cmds.append("--transformer_layer_recompute")
|
||||
|
||||
# Access to the Azure storage shared disk is much slower, so we need a longer timeout.
|
||||
subprocess.run(cmds, timeout=1200).check_returncode()
|
||||
|
||||
for config in configs:
|
||||
|
|
|
|||
|
|
@ -1,29 +1,16 @@
|
|||
trigger: none
|
||||
|
||||
jobs:
|
||||
- job: Onnxruntime_Linux_GPU_Training_E2E_Test_Nightly
|
||||
- job: Orttraining_Linux_GPU_Training_E2E_Test_Nightly
|
||||
|
||||
timeoutInMinutes: 120
|
||||
pool: 'Linux-Multi-GPU-V100-E2E3'
|
||||
|
||||
steps:
|
||||
- checkout: self
|
||||
clean: true
|
||||
submodules: recursive
|
||||
|
||||
# update these if the E2E test data changes
|
||||
- script: |
|
||||
orttraining/tools/ci_test/download_azure_blob_archive.py \
|
||||
--azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \
|
||||
--target_dir $(Build.BinariesDirectory)/training_e2e_test_data \
|
||||
--archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
|
||||
displayName: 'Download onnxruntime_training_data.zip data'
|
||||
|
||||
- script: |
|
||||
orttraining/tools/ci_test/download_azure_blob_archive.py \
|
||||
--azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/glue_MRPC_data.zip \
|
||||
--target_dir /bert_data/hf_data/
|
||||
displayName: 'Download glue_MRPC_data.zip data'
|
||||
|
||||
- template: templates/run-docker-build-steps.yml
|
||||
parameters:
|
||||
RunDockerBuildArgs: |
|
||||
|
|
@ -40,6 +27,18 @@ jobs:
|
|||
-m
|
||||
DisplayName: 'Build'
|
||||
|
||||
# update these if the E2E test data changes
|
||||
- script: |
|
||||
orttraining/tools/ci_test/download_azure_blob_archive.py \
|
||||
--azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \
|
||||
--target_dir $(Build.BinariesDirectory)/training_e2e_test_data \
|
||||
--archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
|
||||
displayName: 'Download onnxruntime_training_data.zip data'
|
||||
|
||||
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh $(orttrainingtestdata-storage-key)
|
||||
displayName: 'Map test data'
|
||||
condition: succeededOrFailed() # ensure all tests are run
|
||||
|
||||
# Hit OOM with run_training_pipeline_e2e_tests.py - slightly above 16GB limit.
|
||||
# leave this code here for further investigation.
|
||||
# https://msdata.visualstudio.com/Vienna/_workitems/edit/956642
|
||||
|
|
@ -204,7 +203,7 @@ jobs:
|
|||
--env CUDA_VISIBLE_DEVICES 2
|
||||
displayName: 'Run orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag'
|
||||
condition: succeededOrFailed() # ensure all tests are run
|
||||
timeoutInMinutes: 10
|
||||
timeoutInMinutes: 30
|
||||
|
||||
- script: |
|
||||
docker run \
|
||||
|
|
@ -252,5 +251,5 @@ jobs:
|
|||
displayName: 'Run orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence'
|
||||
condition: succeededOrFailed() # ensure all tests are run
|
||||
timeoutInMinutes: 20
|
||||
|
||||
|
||||
- template: templates/clean-agent-build-directory-step.yml
|
||||
|
|
|
|||
|
|
@ -0,0 +1,30 @@
|
|||
#! /usr/bin/env bash
#
# Mount the Azure Files share holding the BERT test data at /bert_data.
#
# Usage: azure_scale_set_vm_mount_test_data.sh <storage-account-key>
#
#   $1  Access key of the 'orttrainingtestdata' storage account. It is written
#       to a root-only credentials file that mount.cifs reads.
#
# Steps:
#   1. Unmount and recreate the /bert_data mount point.
#   2. (Re)write /etc/smbcredentials/orttrainingtestdata.cred with mode 600.
#   3. Ensure /etc/fstab has exactly one entry for the share.
#   4. Mount the share.
#
# The script aborts on the first failing command so that a failed unmount can
# never be followed by a recursive delete of the still-mounted share.

set -euo pipefail

if [ "$#" -ne 1 ] || [ -z "$1" ]; then
  echo "usage: $0 <storage-account-key>" >&2
  exit 1
fi

readonly STORAGE_ACCOUNT="orttrainingtestdata"
readonly SHARE="//${STORAGE_ACCOUNT}.file.core.windows.net/bert-data"
readonly MOUNT_POINT="/bert_data"
readonly CRED_DIR="/etc/smbcredentials"
readonly CRED_FILE="${CRED_DIR}/${STORAGE_ACCOUNT}.cred"
readonly MOUNT_OPTS="vers=3.0,credentials=${CRED_FILE},dir_mode=0777,file_mode=0777,serverino"
readonly FSTAB_ENTRY="${SHARE} ${MOUNT_POINT} cifs nofail,${MOUNT_OPTS}"

# Only unmount when something is actually mounted there; a plain directory
# left over from an earlier run is not an error.
if mountpoint -q "${MOUNT_POINT}"; then
  sudo umount "${MOUNT_POINT}"
fi

# Start from an empty local directory. Refuse to delete anything if the path
# is still a mount point (e.g. stacked mounts): 'rm -rf' would then wipe the
# contents of the remote share rather than a local directory.
if [ -d "${MOUNT_POINT}" ]; then
  if mountpoint -q "${MOUNT_POINT}"; then
    echo "error: ${MOUNT_POINT} is still mounted; refusing to remove it" >&2
    exit 1
  fi
  sudo rm -rf "${MOUNT_POINT}"
fi
sudo mkdir -p "${MOUNT_POINT}"

sudo mkdir -p "${CRED_DIR}"

# Create the credentials file root-owned with mode 600 *before* the key is
# written, then fill it through 'sudo tee'. A plain 'sudo echo ... >> file'
# does not work because the redirection is performed by the unprivileged
# calling shell, and "$1" is not visible inside a single-quoted
# 'sudo bash -c' string because that inner shell has its own positional
# parameters. Piping into tee avoids both problems without ever making the
# file world-accessible.
sudo install -m 600 /dev/null "${CRED_FILE}"
printf 'username=%s\npassword=%s\n' "${STORAGE_ACCOUNT}" "$1" \
  | sudo tee "${CRED_FILE}" > /dev/null

# Register the share in /etc/fstab once; re-running the script must not keep
# appending duplicate lines.
if ! grep -qxF -- "${FSTAB_ENTRY}" /etc/fstab; then
  echo "${FSTAB_ENTRY}" | sudo tee -a /etc/fstab > /dev/null
fi

sudo mount -t cifs "${SHARE}" "${MOUNT_POINT}" -o "${MOUNT_OPTS}"
|
||||
Loading…
Reference in a new issue