NOTE(review): this patch was recovered from a whitespace-collapsed copy. Leading
whitespace on context lines is a best-effort reconstruction, and the blob ids on
the "index" lines were not recomputed after the log-message/argparse/YAML fixes.
Apply with `git apply --ignore-whitespace` and confirm against the target tree.

diff --git a/orttraining/orttraining/test/python/orttraining_on_device_training_tests.py b/orttraining/orttraining/test/python/orttraining_on_device_training_tests.py
new file mode 100644
index 0000000000..00e3660740
--- /dev/null
+++ b/orttraining/orttraining/test/python/orttraining_on_device_training_tests.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+# This file contains calls to the tests for on device training offline tooling
+# and training apis. The tests are run in a separate process to avoid
+# testing the entire ort suite of tests yet again (since they are covered in
+# other pipelines) using the gtest filter.
+
+import argparse
+import logging
+import os
+import sys
+
+from _test_commons import run_subprocess
+
+logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.DEBUG)
+log = logging.getLogger("OnDeviceTrainingTests")
+
+
+def parse_arguments():
+    """Parses the command line arguments of this test driver."""
+
+    parser = argparse.ArgumentParser()
+    # --cwd is joined into the gtest executable path below, so fail fast when it is missing.
+    parser.add_argument("--cwd", required=True, help="Path to the current working directory")
+    return parser.parse_args()
+
+
+def run_onnxblock_tests(cwd, log):
+    """Runs the offline tooling tests for on-device training."""
+
+    log.debug("Running: onnxblock tests")
+
+    command = [sys.executable, "-m", "pytest", "-sv", "orttraining_test_onnxblock.py"]
+
+    run_subprocess(command, cwd=cwd, log=log).check_returncode()
+
+
+def run_onnxruntime_test_all_ctest(cwd, log, filter):
+    """Calls onnxruntime_test_all gtest executable with the given filter."""
+
+    command = [os.path.join(cwd, "onnxruntime_test_all"), f"--gtest_filter={filter}"]
+
+    run_subprocess(command, cwd=cwd, log=log).check_returncode()
+
+
+def run_training_api_tests(cwd, log):
+    """Runs the onnxruntime_test_all executable with the TrainingApiTest* gtest filter."""
+
+    log.debug("Running: TrainingApi tests")
+
+    run_onnxruntime_test_all_ctest(cwd, log, "TrainingApiTest*")
+
+
+def run_checkpoint_api_tests(cwd, log):
+    """Runs the onnxruntime_test_all executable with the CheckpointApiTest* gtest filter."""
+
+    log.debug("Running: CheckpointApi tests")
+
+    run_onnxruntime_test_all_ctest(cwd, log, "CheckpointApiTest*")
+
+
+def main():
+    """Runs the onnxblock, TrainingApi and CheckpointApi test suites in order and returns 0."""
+
+    args = parse_arguments()
+    cwd = args.cwd
+
+    log.info("Running on-device training tests pipeline")
+
+    run_onnxblock_tests(cwd, log)
+
+    run_training_api_tests(cwd, log)
+
+    run_checkpoint_api_tests(cwd, log)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/orttraining/orttraining/test/python/orttraining_test_onnxblock.py b/orttraining/orttraining/test/python/orttraining_test_onnxblock.py
index d06c48aa2c..e66d7ada2c 100644
--- a/orttraining/orttraining/test/python/orttraining_test_onnxblock.py
+++ b/orttraining/orttraining/test/python/orttraining_test_onnxblock.py
@@ -338,10 +338,6 @@ def test_mse_loss_training_graph_execution():
         # assert loss is close
         assert np.allclose(ort_outs[0], _to_numpy(torch_outs))
 
-        # assert all the gradients are close
-        for ort_grad, pt_param in zip(ort_outs[1:], pt_model.parameters()):
-            assert np.allclose(ort_grad, _to_numpy(pt_param.grad))
-
 
 def test_crossentropy_loss_training_graph_execution():
     # Given
@@ -377,10 +373,6 @@ def test_crossentropy_loss_training_graph_execution():
         # assert loss is close
         assert np.allclose(ort_outs[0], _to_numpy(torch_outs))
 
-        # assert all the gradients are close
-        for ort_grad, pt_param in zip(ort_outs[1:], pt_model.parameters()):
-            assert np.allclose(ort_grad, _to_numpy(pt_param.grad))
-
 
 def test_bcewithlogits_loss_training_graph_execution():
     # Given
@@ -416,10 +408,6 @@ def test_bcewithlogits_loss_training_graph_execution():
         # assert loss is close
         assert np.allclose(ort_outs[0], _to_numpy(torch_outs))
 
-        # assert all the gradients are close
-        for ort_grad, pt_param in zip(ort_outs[1:], pt_model.parameters()):
-            assert np.allclose(ort_grad, _to_numpy(pt_param.grad))
-
 
 @pytest.mark.parametrize(
     "graph",
diff --git a/orttraining/orttraining/test/training_api/core/checkpoint_test.cc b/orttraining/orttraining/test/training_api/core/checkpoint_test.cc
index 4f9405bc6b..88a87a2e3e 100644
--- a/orttraining/orttraining/test/training_api/core/checkpoint_test.cc
+++ b/orttraining/orttraining/test/training_api/core/checkpoint_test.cc
@@ -172,7 +172,7 @@ TEST(CheckpointApiTest, SaveOptimizerStateAsCheckpoint_ThenLoad_CUDA) {
   /// Phase 1 - Test Preparison
   /// Prepare the data and dest folder for saving checkpoint.
   /// Also cooked the data for test result comparison.
-  auto model_uri = MODEL_FOLDER "training_api/gradient_graph.onnx";
+  auto model_uri = MODEL_FOLDER "training_api/training_model.onnx";
   auto optim_uri = MODEL_FOLDER "training_api/adamw.onnx";
 
   // Generate randomized weight values using synthetic data generator.
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-on-device-training.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-on-device-training.yml
new file mode 100644
index 0000000000..8e06c0de43
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-on-device-training.yml
@@ -0,0 +1,48 @@
+trigger: none
+
+jobs:
+- job: Onnxruntime_Linux_GPU_OnDeviceTraining
+
+  timeoutInMinutes: 120
+  pool: 'Onnxruntime-Linux-GPU-NC6sv3'
+
+  steps:
+  - checkout: self
+    clean: true
+    submodules: recursive
+
+  - template: templates/run-docker-build-steps.yml
+    parameters:
+      RunDockerBuildArgs: |
+        -o ubuntu20.04 -d gpu -e \
+        -t onnxruntime_ondevicetraining_tests_image \
+        -x " \
+        --config RelWithDebInfo \
+        --enable_training \
+        --enable_training_on_device \
+        --use_cuda --cuda_version=11.3 --cuda_home=/usr/local/cuda-11.3 --cudnn_home=/usr/local/cuda-11.3 \
+        --build_wheel \
+        --skip_tests \
+        " \
+        -u
+      DisplayName: 'Build onnxruntime'
+
+  # Entry point for all on device training tests
+  - script: |
+      docker run \
+        --gpus all \
+        --shm-size=1024m \
+        --rm \
+        --volume $(Build.SourcesDirectory):/onnxruntime_src \
+        --volume $(Build.BinariesDirectory):/build \
+        onnxruntime_ondevicetraining_tests_image \
+          /build/RelWithDebInfo/launch_test.py \
+          --cwd /build/RelWithDebInfo --cmd_line_with_args "python orttraining_on_device_training_tests.py --cwd /build/RelWithDebInfo"
+    displayName: 'Run On-Device Training Tests'
+    condition: succeededOrFailed()
+    timeoutInMinutes: 120
+  - template: templates/component-governance-component-detection-steps.yml
+    parameters:
+      condition: 'succeeded'
+
+  - template: templates/clean-agent-build-directory-step.yml