Move Linux CPU pipelines to an AMD CPU pool which is cheaper (#15144)

### Description
1. Move Linux CPU pipelines to an AMD CPU pool, which is cheaper.
2. Enable CCache for the orttraining pipeline.

### Motivation and Context
Azure AMD CPU machines are generally much cheaper than Intel CPU
machines. However, they don't have local disks.
This commit is contained in:
Changming Sun 2023-03-27 14:10:08 -07:00 committed by GitHub
parent 67a6022c03
commit 63cc1bb26a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
19 changed files with 75 additions and 72 deletions

View file

@ -105,8 +105,8 @@ class OrtOpTests(unittest.TestCase):
pt_y1, pt_y2 = model(x)
session = ort.InferenceSession(exported_model.SerializeToString(), providers=["CPUExecutionProvider"])
ort_y1, ort_y2 = session.run([], {"x": x.numpy()})
np.testing.assert_almost_equal(ort_y1, pt_y1.detach().numpy())
np.testing.assert_almost_equal(ort_y2, pt_y2.detach().numpy())
np.testing.assert_almost_equal(ort_y1, pt_y1.detach().numpy(), decimal=6)
np.testing.assert_almost_equal(ort_y2, pt_y2.detach().numpy(), decimal=6)
# Run w/ IO binding.
for _ in range(8):
@ -123,8 +123,8 @@ class OrtOpTests(unittest.TestCase):
io_binding.bind_ortvalue_output(exported_model.graph.output[0].name, ort_y1)
io_binding.bind_ortvalue_output(exported_model.graph.output[1].name, ort_y2)
session.run_with_iobinding(io_binding)
np.testing.assert_almost_equal(np_y1, pt_y1.detach().numpy())
np.testing.assert_almost_equal(np_y2, pt_y2.detach().numpy())
np.testing.assert_almost_equal(np_y1, pt_y1.detach().numpy(), decimal=6)
np.testing.assert_almost_equal(np_y2, pt_y2.detach().numpy(), decimal=6)
if __name__ == "__main__":

View file

@ -20,7 +20,7 @@ stages:
- stage: BUILD_STAGE
jobs:
- job: Build_CPU_EP
pool: Linux-CPU-2019
pool: onnxruntime-Linux-CPU-2019
workspace:
clean: all
timeoutInMinutes: 30
@ -105,7 +105,7 @@ stages:
- template: templates/clean-agent-build-directory-step.yml
- job: Build_NNAPI_EP
pool: Linux-CPU-2019
pool: onnxruntime-Linux-CPU-2019
timeoutInMinutes: 60
workspace:
clean: all
@ -452,7 +452,7 @@ stages:
variables:
- name: skipComponentGovernanceDetection
value: true
pool: 'Linux-CPU-2019'
pool: 'onnxruntime-Linux-CPU-2019'
condition: and(succeeded(), in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI'))
dependsOn:
- NNAPI_EP_MASTER

View file

@ -26,8 +26,7 @@ jobs:
timeoutInMinutes: 60
workspace:
clean: all
pool: Linux-CPU-2019
pool: onnxruntime-Linux-CPU-2019
strategy:
# Notes:
# - BuildConfigFile path is relative to repository root.

View file

@ -10,14 +10,14 @@ stages:
dependsOn: []
jobs:
- job: Linux_Build
timeoutInMinutes: 120
timeoutInMinutes: 180
workspace:
clean: all
variables:
skipComponentGovernanceDetection: true
CCACHE_DIR: $(Pipeline.Workspace)/ccache
TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
pool: Linux-CPU-2019
pool: onnxruntime-Linux-CPU-2019
steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'
@ -53,16 +53,12 @@ stages:
"$(TODAY)" |
displayName: Cach Task
- script: |
sudo mkdir -p $(Pipeline.Workspace)/ccache
condition: ne(variables.CACHE_RESTORED, 'true')
displayName: Create Cache Dir
- task: CmdLine@2
displayName: 'build'
inputs:
script: |
mkdir -p $HOME/.onnx
mkdir -p $(Pipeline.Workspace)/ccache
docker run --rm \
--volume /data/onnx:/data/onnx:ro \
--volume $(Build.SourcesDirectory):/onnxruntime_src \

View file

@ -14,7 +14,7 @@ jobs:
variables:
CCACHE_DIR: $(Pipeline.Workspace)/ccache
TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
pool: Linux-CPU-2019
pool: onnxruntime-Linux-CPU-2019
steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'

View file

@ -11,7 +11,7 @@ jobs:
timeoutInMinutes: 120
workspace:
clean: all
pool: Linux-CPU-2019
pool: onnxruntime-Linux-CPU-2019
steps:
- checkout: self
clean: true
@ -87,7 +87,7 @@ jobs:
timeoutInMinutes: 120
workspace:
clean: all
pool: Linux-CPU-2019
pool: onnxruntime-Linux-CPU-2019
steps:
- checkout: self
clean: true

View file

@ -34,8 +34,7 @@ jobs:
timeoutInMinutes: 120
workspace:
clean: all
pool: Linux-CPU-2019
pool: onnxruntime-Linux-CPU-2019
variables:
test_data_directory: $(Build.SourcesDirectory)/.test_data

View file

@ -14,7 +14,7 @@ jobs:
CCACHE_DIR: $(Pipeline.Workspace)/ccache
workspace:
clean: all
pool: Linux-CPU-2019
pool: onnxruntime-Linux-CPU-2019
steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'

View file

@ -32,4 +32,4 @@ jobs:
parameters:
NpmPackagingMode: ${{ variables.NpmPackagingMode }}
BuildConfig: 'Release'
PoolName: 'Linux-CPU-2019'
PoolName: 'onnxruntime-Linux-CPU-2019'

View file

@ -1,6 +1,6 @@
resources:
repositories:
- repository: manylinux
- repository: manylinux # The name used to reference this repository in the checkout step
type: Github
endpoint: Microsoft
name: pypa/manylinux
@ -8,16 +8,22 @@ resources:
jobs:
- job: Linux_Build
timeoutInMinutes: 150
variables:
skipComponentGovernanceDetection: true
timeoutInMinutes: 180
workspace:
clean: all
pool: Linux-CPU-2019
variables:
skipComponentGovernanceDetection: true
CCACHE_DIR: $(Pipeline.Workspace)/ccache
TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
pool: onnxruntime-Ubuntu-2004-Training-CPU
steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'
condition: always()
- checkout: self
clean: true
submodules: recursive
submodules: none
- task: NodeTool@0
inputs:
@ -35,29 +41,48 @@ jobs:
DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
Repository: onnxruntimecpubuild
- task: Cache@2
inputs:
key: '"$(TODAY)" | "$(Build.SourceBranch)" | "$(Build.SourceVersion)"'
path: $(CCACHE_DIR)
cacheHitVar: CACHE_RESTORED
restoreKeys: |
"$(TODAY)" | "$(Build.SourceBranch)"
"$(TODAY)" |
displayName: Cach Task
- task: CmdLine@2
displayName: 'build'
inputs:
script: |
mkdir -p $HOME/.onnx
mkdir -p $(Pipeline.Workspace)/ccache
docker run --rm \
--volume /data/onnx:/data/onnx:ro \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
--volume $(Pipeline.Workspace)/ccache:/cache \
-e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
-e CCACHE_DIR=/cache \
onnxruntimecpubuild \
/bin/bash -c "
set -ex; \
ccache -s; \
/opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
--build_dir /build --cmake_generator Ninja \
--config Debug Release \
--config Release \
--skip_submodule_sync \
--build_shared_lib \
--parallel \
--build_wheel --enable_training \
--build_wheel \
--enable_onnx_tests \
--build_java --build_nodejs --update --build
--enable_training \
--use_cache \
--build_java --build_nodejs --update --build; \
ccache -s"
workingDirectory: $(Build.SourcesDirectory)
- task: CmdLine@2
@ -69,13 +94,10 @@ jobs:
cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt
# Test ORT with the latest ONNX release.
sed -i "s/git+http:\/\/github\.com\/onnx\/onnx.*/onnx/" $(Build.BinariesDirectory)/requirements.txt
#Do not explicitly specify numpy version as this is not a packaging pipeline, any version should be ok
sed -i "/^numpy/d" $(Build.BinariesDirectory)/requirements.txt
python3 -m pip install -r $(Build.BinariesDirectory)/requirements.txt
mkdir $(Build.BinariesDirectory)/requirements_torch_cpu
mkdir $(Build.BinariesDirectory)/requirements_torch_cpu/
cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cpu/requirements.txt $(Build.BinariesDirectory)/requirements_torch_cpu/requirements.txt
python3 -m pip install -r $(Build.BinariesDirectory)/requirements_torch_cpu/requirements.txt
ln -sf /data/models $(Build.BinariesDirectory)
cd $(Build.SourcesDirectory)/java
$(Build.SourcesDirectory)/java/gradlew "cmakeCheck" "-DcmakeBuildDir=$(Build.BinariesDirectory)/Release"
@ -86,35 +108,24 @@ jobs:
rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11
python3 -m pip install $(Build.BinariesDirectory)/Release/dist/*.whl
- task: PythonScript@0
displayName: 'Run Release unit tests'
inputs:
scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py
workingDirectory: $(Build.BinariesDirectory)/Release
arguments: --build_dir $(Build.BinariesDirectory) --cmake_generator Ninja --config Release --test --skip_submodule_sync --build_shared_lib --parallel --build_wheel --enable_training --enable_onnx_tests --build_nodejs --ctest_path ""
- task: CmdLine@2
displayName: 'Install Debug python package'
inputs:
script: |
set -e -x
rm -rf $(Build.BinariesDirectory)/Debug/onnxruntime $(Build.BinariesDirectory)/Debug/pybind11
python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq
python3 -m pip install $(Build.BinariesDirectory)/Debug/dist/*.whl
- task: PythonScript@0
displayName: 'Run Debug unit tests'
inputs:
scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py
arguments: --build_dir $(Build.BinariesDirectory) --cmake_generator Ninja --config Debug --test --skip_submodule_sync --build_shared_lib --parallel --build_wheel --enable_training --enable_onnx_tests --build_nodejs --ctest_path ""
workingDirectory: $(Build.BinariesDirectory)/Debug
- task: PythonScript@0
displayName: 'Symbolic shape infer'
inputs:
scriptPath: $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py
workingDirectory: $(Build.BinariesDirectory)/Release
arguments: >-
--build_dir $(Build.BinariesDirectory)
--cmake_generator Ninja
--config Release
--test
--skip_submodule_sync
--build_shared_lib
--parallel
--build_wheel
--enable_onnx_tests
--enable_training
--build_nodejs
--ctest_path ""
- task: PublishTestResults@2
displayName: 'Publish unit test results'
@ -122,6 +133,4 @@ jobs:
testResultsFiles: '**/*.results.xml'
searchFolder: '$(Build.BinariesDirectory)'
testRunTitle: 'Unit Test Run'
condition: succeededOrFailed()
- template: templates/clean-agent-build-directory-step.yml
condition: succeededOrFailed()

View file

@ -4,7 +4,7 @@ jobs:
- job: Onnxruntime_Linux_GPU_ORTTraining_Clear_Cache
timeoutInMinutes: 15
pool: 'Linux-CPU-2019'
pool: 'onnxruntime-Linux-CPU-2019'
steps:
- checkout: self

View file

@ -25,7 +25,7 @@ stages:
timeoutInMinutes: 120
workspace:
clean: all
pool: Linux-CPU
pool: aiinfra-Linux-CPU
strategy:
matrix:

View file

@ -766,7 +766,7 @@ stages:
- template: ../nuget/templates/test_linux.yml
parameters:
AgentPool : Linux-CPU
AgentPool : aiinfra-Linux-CPU
NugetPackageName : 'Microsoft.ML.OnnxRuntime'
ArtifactSuffix: 'CPU'

View file

@ -1,5 +1,5 @@
parameters:
AgentPool : 'Linux-CPU-2019'
AgentPool : 'onnxruntime-Linux-CPU-2019'
StageName : 'Linux_CI_Dev'
SubmoduleCheckoutMode: ''
RunDockerBuildArgs: '-o ubuntu20.04 -d cpu -x "--build_wheel"'

View file

@ -48,7 +48,7 @@ stages:
timeoutInMinutes: 90
workspace:
clean: all
pool: Linux-CPU
pool: aiinfra-Linux-CPU
strategy:
matrix:
${{ each PythonVersion in parameters.python_version }}:

View file

@ -10,7 +10,7 @@ parameters:
- name: PoolName
displayName: 'Agent pool name'
type: string
default: 'Win-CPU-2019'
default: 'onnxruntime-Win-CPU-2019'
- name: PackageName
displayName: 'Package name'
type: string

View file

@ -13,7 +13,7 @@ parameters:
- name: PoolName
type: string
default: 'Win-CPU-2019'
default: 'onnxruntime-Win-CPU-2019'
- name: SkipPublish
type: boolean

View file

@ -13,7 +13,7 @@ parameters:
- name: PoolName
type: string
default: 'Win-CPU-2019'
default: 'onnxruntime-Win-CPU-2019'
- name: PackageName
displayName: 'Package name'

View file

@ -28,5 +28,5 @@ stages:
parameters:
NpmPackagingMode: ${{ variables.NpmPackagingMode }}
IsReleasePipeline: false
PoolName: 'Win-CPU-2019'
PoolName: 'onnxruntime-Win-CPU-2019'
BuildStaticLib: true