mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-04 04:07:22 +00:00
On device training CI pipeline (#11987)
This commit is contained in:
parent
8d0e86dec8
commit
ddb45e9126
4 changed files with 126 additions and 13 deletions
|
|
@ -0,0 +1,77 @@
|
|||
#!/usr/bin/env python3
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# This file contains calls to the tests for on device training offline tooling
|
||||
# and training apis. The tests are run in a separate process to avoid
|
||||
# testing the entire ort suite of tests yet again (since they are covered in
|
||||
# other pipelines) using the gtest filter.
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
from _test_commons import run_subprocess
|
||||
|
||||
# Root logging is configured at DEBUG so the per-suite "Running: ..." debug
# messages emitted by the helpers below are visible in the pipeline output.
_LOG_FORMAT = "%(asctime)s %(name)s [%(levelname)s] - %(message)s"
logging.basicConfig(
    format=_LOG_FORMAT,
    level=logging.DEBUG,
)

# Module-wide logger handed to every test runner in this script.
log = logging.getLogger("OnDeviceTrainingTests")
|
||||
|
||||
|
||||
def parse_arguments():
    """Parse the command line options of the on device training test driver.

    Returns:
        argparse.Namespace: Namespace whose ``cwd`` attribute holds the value
        passed through ``--cwd``. When the option is omitted, ``cwd`` falls
        back to the working directory of the running process instead of
        ``None``, so that joining it with an executable name later on does not
        fail with a ``TypeError``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cwd",
        # Evaluated when the parser is built, i.e. at call time of this function.
        default=os.getcwd(),
        help="Path to the current working directory",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def run_onnxblock_tests(cwd, log):
    """Run the onnxblock (offline tooling) test module through pytest.

    Args:
        cwd: Working directory handed to ``run_subprocess``; the test module
            ``orttraining_test_onnxblock.py`` is referenced relative to it.
        log: Logger used for the progress message and forwarded to
            ``run_subprocess``.

    The object returned by ``run_subprocess`` has ``check_returncode()``
    called on it (presumably a ``subprocess.CompletedProcess`` -- confirm in
    ``_test_commons``), so a failing pytest run is expected to raise.
    """
    log.debug("Running: onnxblock tests")

    # Use the interpreter running this script so pytest sees the same environment.
    pytest_invocation = [sys.executable, "-m", "pytest", "-sv", "orttraining_test_onnxblock.py"]

    completed = run_subprocess(pytest_invocation, cwd=cwd, log=log)
    completed.check_returncode()
|
||||
|
||||
|
||||
def run_onnxruntime_test_all_ctest(cwd, log, filter):
    """Invoke the onnxruntime_test_all gtest executable restricted to ``filter``.

    Args:
        cwd: Directory that contains the ``onnxruntime_test_all`` executable;
            also used as the working directory of the child process.
        log: Logger forwarded to ``run_subprocess``.
        filter: Pattern placed after ``--gtest_filter=`` on the command line.

    ``check_returncode()`` is called on the value returned by
    ``run_subprocess`` (presumably a ``subprocess.CompletedProcess`` --
    confirm in ``_test_commons``), so a failing run is expected to raise.
    """
    executable = os.path.join(cwd, "onnxruntime_test_all")
    gtest_invocation = [executable, f"--gtest_filter={filter}"]

    completed = run_subprocess(gtest_invocation, cwd=cwd, log=log)
    completed.check_returncode()
|
||||
|
||||
|
||||
def run_training_api_tests(cwd, log):
    """Run the ``TrainingApiTest*`` subset of the onnxruntime_test_all executable.

    Args:
        cwd: Directory forwarded to ``run_onnxruntime_test_all_ctest``.
        log: Logger used for the progress message and forwarded along.
    """
    log.debug("Running: TrainingApi tests")

    gtest_filter = "TrainingApiTest*"
    run_onnxruntime_test_all_ctest(cwd, log, gtest_filter)
|
||||
|
||||
|
||||
def run_checkpoint_api_tests(cwd, log):
    """Runs the onnxruntime_test_all executable with the CheckpointApiTest* gtest filter.

    Args:
        cwd: Directory forwarded to ``run_onnxruntime_test_all_ctest``, which
            looks for the ``onnxruntime_test_all`` executable inside it.
        log: Logger used for the progress message and forwarded along.
    """
    # Report the suite that is actually executed; the message previously named
    # the TrainingApi suite, which made the pipeline log misleading.
    log.debug("Running: CheckpointApi tests")

    run_onnxruntime_test_all_ctest(cwd, log, "CheckpointApiTest*")
|
||||
|
||||
|
||||
def main():
    """Run all on device training test suites one after another.

    The suites run in this order: onnxblock (pytest), TrainingApi (gtest) and
    CheckpointApi (gtest). Each runner calls ``check_returncode()`` on its
    subprocess result, so a failing suite is expected to raise and stop the
    remaining suites from running.

    Returns:
        int: ``0`` once every suite has been invoked without raising; used as
        the process exit code by the ``__main__`` guard.
    """
    args = parse_arguments()
    cwd = args.cwd

    # This driver covers the on device training tests, not the ortmodule ones.
    log.info("Running on device training tests pipeline")

    run_onnxblock_tests(cwd, log)

    run_training_api_tests(cwd, log)

    run_checkpoint_api_tests(cwd, log)

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
|
|
@ -338,10 +338,6 @@ def test_mse_loss_training_graph_execution():
|
|||
# assert loss is close
|
||||
assert np.allclose(ort_outs[0], _to_numpy(torch_outs))
|
||||
|
||||
# assert all the gradients are close
|
||||
for ort_grad, pt_param in zip(ort_outs[1:], pt_model.parameters()):
|
||||
assert np.allclose(ort_grad, _to_numpy(pt_param.grad))
|
||||
|
||||
|
||||
def test_crossentropy_loss_training_graph_execution():
|
||||
# Given
|
||||
|
|
@ -377,10 +373,6 @@ def test_crossentropy_loss_training_graph_execution():
|
|||
# assert loss is close
|
||||
assert np.allclose(ort_outs[0], _to_numpy(torch_outs))
|
||||
|
||||
# assert all the gradients are close
|
||||
for ort_grad, pt_param in zip(ort_outs[1:], pt_model.parameters()):
|
||||
assert np.allclose(ort_grad, _to_numpy(pt_param.grad))
|
||||
|
||||
|
||||
def test_bcewithlogits_loss_training_graph_execution():
|
||||
# Given
|
||||
|
|
@ -416,10 +408,6 @@ def test_bcewithlogits_loss_training_graph_execution():
|
|||
# assert loss is close
|
||||
assert np.allclose(ort_outs[0], _to_numpy(torch_outs))
|
||||
|
||||
# assert all the gradients are close
|
||||
for ort_grad, pt_param in zip(ort_outs[1:], pt_model.parameters()):
|
||||
assert np.allclose(ort_grad, _to_numpy(pt_param.grad))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"graph",
|
||||
|
|
|
|||
|
|
@ -172,7 +172,7 @@ TEST(CheckpointApiTest, SaveOptimizerStateAsCheckpoint_ThenLoad_CUDA) {
|
|||
/// Phase 1 - Test Preparation
|
||||
/// Prepare the data and dest folder for saving checkpoint.
|
||||
/// Also cooked the data for test result comparison.
|
||||
auto model_uri = MODEL_FOLDER "training_api/gradient_graph.onnx";
|
||||
auto model_uri = MODEL_FOLDER "training_api/training_model.onnx";
|
||||
auto optim_uri = MODEL_FOLDER "training_api/adamw.onnx";
|
||||
|
||||
// Generate randomized weight values using synthetic data generator.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,48 @@
|
|||
trigger: none
|
||||
|
||||
jobs:
|
||||
- job: Onnxruntime_Linux_GPU_OnDeviceTraining
|
||||
|
||||
timeoutInMinutes: 120
|
||||
pool: 'Onnxruntime-Linux-GPU-NC6sv3'
|
||||
|
||||
steps:
|
||||
- checkout: self
|
||||
clean: true
|
||||
submodules: recursive
|
||||
|
||||
- template: templates/run-docker-build-steps.yml
|
||||
parameters:
|
||||
RunDockerBuildArgs: |
|
||||
-o ubuntu20.04 -d gpu -e \
|
||||
-t onnxruntime_ondevicetraining_tests_image \
|
||||
-x " \
|
||||
--config RelWithDebInfo \
|
||||
--enable_training \
|
||||
--enable_training_on_device \
|
||||
--use_cuda --cuda_version=11.3 --cuda_home=/usr/local/cuda-11.3 --cudnn_home=/usr/local/cuda-11.3 \
|
||||
--build_wheel \
|
||||
--skip_tests \
|
||||
" \
|
||||
-u
|
||||
DisplayName: 'Build onnxruntime'
|
||||
|
||||
# Entry point for all on device training tests
|
||||
- script: |
|
||||
docker run \
|
||||
--gpus all \
|
||||
--shm-size=1024m \
|
||||
--rm \
|
||||
--volume $(Build.SourcesDirectory):/onnxruntime_src \
|
||||
--volume $(Build.BinariesDirectory):/build \
|
||||
onnxruntime_ondevicetraining_tests_image \
|
||||
/build/RelWithDebInfo/launch_test.py \
|
||||
--cwd /build/RelWithDebInfo --cmd_line_with_args "python orttraining_on_device_training_tests.py --cwd /build/RelWithDebInfo" \
|
||||
displayName: 'Run On-Device Training Tests'
|
||||
condition: succeededOrFailed()
|
||||
timeoutInMinutes: 120
|
||||
- template: templates/component-governance-component-detection-steps.yml
|
||||
parameters:
|
||||
condition: 'succeeded'
|
||||
|
||||
- template: templates/clean-agent-build-directory-step.yml
|
||||
Loading…
Reference in a new issue