From 2beb873c6bbf174e66f072d88ebd765bb94c6ff1 Mon Sep 17 00:00:00 2001 From: liqun Fu Date: Wed, 18 Aug 2021 18:36:19 -0700 Subject: [PATCH] move training CI agent pools to 1ES hosted (#8775) --- .../orttraining-linux-gpu-ci-pipeline.yml | 2 +- ...linux-gpu-distributed-e2e-test-pipeline.yml | 4 ++-- ...-ortmodule-distributed-test-ci-pipeline.yml | 4 ++-- ...gpu-ortmodule-test-clear-cache-pipeline.yml | 2 +- ...raining-linux-gpu-perf-test-ci-pipeline.yml | 4 ++-- ...ng-linux-gpu-ortmodule-test-ci-pipeline.yml | 6 +++--- .../py-packaging-training-cuda-stage.yml | 6 +++--- .../azure_scale_set_vm_mount_test_data.sh | 18 +++++++++--------- 8 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml index cf96ea585f..e474982625 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml @@ -3,7 +3,7 @@ trigger: none jobs: - template: templates/linux-ci.yml parameters: - AgentPool : 'Linux-Single-GPU-V100' + AgentPool : 'Onnxruntime-Linux-GPU-NC6sv3' JobName: 'Onnxruntime_Linux_GPU_Training' SubmoduleCheckoutMode: 'recursive' RunDockerBuildArgs: > diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-distributed-e2e-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-distributed-e2e-test-pipeline.yml index 6135f751f2..5f1cd9b219 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-distributed-e2e-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-distributed-e2e-test-pipeline.yml @@ -4,7 +4,7 @@ jobs: - job: Orttraining_Linux_GPU_Distributed_E2E_Test timeoutInMinutes: 180 - pool: 'Linux-Multi-GPU-V100-E2E3' + pool: 'Onnxruntime-Linux-GPU-NC24sv3' steps: - checkout: self @@ -33,7 +33,7 @@ jobs: --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9 displayName: 'Download onnxruntime_training_data.zip data' - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/bert-data" -d "/bert_data" + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data" displayName: 'Mount bert-data' condition: succeededOrFailed() diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml index fc521c8f4a..8cf65d7d0b 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml @@ -4,7 +4,7 @@ jobs: - job: Onnxruntime_Linux_GPU_ORTModule_Distributed_Test timeoutInMinutes: 120 - pool: 'Linux-Multi-GPU-V100' + pool: 'Onnxruntime-Linux-GPU-NC24sv3' steps: - checkout: self @@ -29,7 +29,7 @@ jobs: -e DisplayName: 'Build' - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist" + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist" displayName: 'Mount MNIST' condition: succeededOrFailed() diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-clear-cache-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-clear-cache-pipeline.yml index 7e7d7253c0..e2937b2aee 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-clear-cache-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-clear-cache-pipeline.yml @@ -11,7 +11,7 @@ jobs: clean: true submodules: recursive - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" displayName: 'Mount hf-models-cache' condition: succeededOrFailed() diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-perf-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-perf-test-ci-pipeline.yml index 2ea8bc4068..bb56fb7c34 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-perf-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-perf-test-ci-pipeline.yml @@ -25,11 +25,11 @@ jobs: " DisplayName: 'Build performance tests' - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/bert-data" -d "/bert_data" + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data" displayName: 'Mount bert-data' condition: succeededOrFailed() # ensure all tests are run - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/gpt2-data" -d "/gpt2_data" + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/gpt2-data" -d "/gpt2_data" displayName: 'Mount gpt2 test data' condition: succeededOrFailed() # ensure all tests are run diff --git a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml index 4bf5a97b66..759ab5f58c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml @@ -6,15 +6,15 @@ parameters: steps: -- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist" +- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist" displayName: 'Mount MNIST' condition: succeededOrFailed() -- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/bert-data" -d "/bert_data" +- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data" displayName: 'Mount bert-data' condition: succeededOrFailed() -- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" +- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" displayName: 'Mount hf-models-cache' condition: succeededOrFailed() diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml index 63e75c176a..fa551da5c8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml @@ -92,15 +92,15 @@ stages: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64/dyninst:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib/dyninst:/usr/local/lib64 Repository: onnxruntimetraininggpubuild - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist" + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist" displayName: 'Mount MNIST' condition: succeededOrFailed() - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/bert-data" -d "/bert_data" + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data" displayName: 'Mount bert-data' condition: succeededOrFailed() - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" displayName: 'Mount hf-models-cache' condition: succeededOrFailed() diff --git a/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh b/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh index 511d549fad..8a09f86f00 100755 --- a/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh +++ b/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh @@ -5,19 +5,19 @@ function credentialize () { sudo mkdir /etc/smbcredentials fi - if [ -f "/etc/smbcredentials/orttrainingtestdata.cred" ]; then - sudo rm /etc/smbcredentials/orttrainingtestdata.cred + if [ -f "/etc/smbcredentials/orttrainingtestdatascus.cred" ]; then + sudo rm /etc/smbcredentials/orttrainingtestdatascus.cred fi - # to create orttrainingtestdata.cred, I have to do: 'sudo bash -c ...' - sudo bash -c 'echo "username=orttrainingtestdata" >> /etc/smbcredentials/orttrainingtestdata.cred' + # to create orttrainingtestdatascus.cred, I have to do: 'sudo bash -c ...' + sudo bash -c 'echo "username=orttrainingtestdatascus" >> /etc/smbcredentials/orttrainingtestdatascus.cred' # $1 get removed (do defend injection attack?) if I do 'sudo bash -c...' # to enable 'sudo echo...' I need to 'sudo chmod 777...' first. - sudo chmod 777 /etc/smbcredentials/orttrainingtestdata.cred - sudo echo "password=$1" >> /etc/smbcredentials/orttrainingtestdata.cred + sudo chmod 777 /etc/smbcredentials/orttrainingtestdatascus.cred + sudo echo "password=$1" >> /etc/smbcredentials/orttrainingtestdatascus.cred - sudo chmod 600 /etc/smbcredentials/orttrainingtestdata.cred + sudo chmod 600 /etc/smbcredentials/orttrainingtestdatascus.cred } function mount_data () { @@ -36,8 +36,8 @@ function mount_data () { sudo mkdir -p $2 - sudo bash -c 'echo "$1 $2 cifs nofail,vers=3.0,credentials=/etc/smbcredentials/orttrainingtestdata.cred,dir_mode=0777,file_mode=0777,serverino" >> /etc/fstab' -- $1 $2 - sudo mount -t cifs $1 $2 -o vers=3.0,credentials=/etc/smbcredentials/orttrainingtestdata.cred,dir_mode=0777,file_mode=0777,serverino + sudo bash -c 'echo "$1 $2 cifs nofail,vers=3.0,credentials=/etc/smbcredentials/orttrainingtestdatascus.cred,dir_mode=0777,file_mode=0777,serverino" >> /etc/fstab' -- $1 $2 + sudo mount -t cifs $1 $2 -o vers=3.0,credentials=/etc/smbcredentials/orttrainingtestdatascus.cred,dir_mode=0777,file_mode=0777,serverino } while getopts "p:s:d:" opt; do