diff --git a/orttraining/orttraining/test/python/orttraining_on_device_training_tests.py b/orttraining/orttraining/test/python/orttraining_on_device_training_tests.py
new file mode 100644
index 0000000000..00e3660740
--- /dev/null
+++ b/orttraining/orttraining/test/python/orttraining_on_device_training_tests.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+# This file contains calls to the tests for the on-device training offline
+# tooling and training APIs. The tests run in separate processes, and a gtest
+# filter selects only the relevant cases so that the rest of the ORT test suite
+# (already covered by other pipelines) is not run again.
+
+import argparse
+import logging
+import os
+import sys
+
+from _test_commons import run_subprocess
+
+logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.DEBUG)
+log = logging.getLogger("OnDeviceTrainingTests")  # module logger; main() hands it to each test runner
+
+
+def parse_arguments():  # --cwd is mandatory: the test steps join it into paths and run from it
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--cwd", required=True, help="Path to the current working directory")
+    return parser.parse_args()
+
+
+def run_onnxblock_tests(cwd, log):
+    """Invoke pytest on orttraining_test_onnxblock.py from ``cwd`` and check its return code."""
+    log.debug("Running: onnxblock tests")
+
+    # Launch pytest through the interpreter that is running this script.
+    pytest_args = ["-m", "pytest", "-sv", "orttraining_test_onnxblock.py"]
+    completed = run_subprocess([sys.executable, *pytest_args], cwd=cwd, log=log)
+    completed.check_returncode()
+
+
+def run_onnxruntime_test_all_ctest(cwd, log, filter):
+    """Run the onnxruntime_test_all binary found in ``cwd``, limited to tests matching ``filter``."""
+    # ``filter`` shadows the builtin; the name is kept because it is part of the signature.
+    executable = os.path.join(cwd, "onnxruntime_test_all")
+    result = run_subprocess([executable, f"--gtest_filter={filter}"], cwd=cwd, log=log)
+    result.check_returncode()
+
+
+def run_training_api_tests(cwd, log):
+    """Run the gtest cases of onnxruntime_test_all selected by the TrainingApiTest* filter."""
+
+    gtest_filter = "TrainingApiTest*"
+    log.debug("Running: TrainingApi tests")
+    run_onnxruntime_test_all_ctest(cwd, log, gtest_filter)
+
+
+def run_checkpoint_api_tests(cwd, log):
+    """Runs the onnxruntime_test_all executable with the CheckpointApiTest* gtest filter."""
+
+    # Log the suite that is actually run here so this step is distinguishable from the TrainingApi one.
+    log.debug("Running: CheckpointApi tests")
+    run_onnxruntime_test_all_ctest(cwd, log, "CheckpointApiTest*")
+
+
+def main():
+    """Run the onnxblock, TrainingApi and CheckpointApi tests in order; return 0 when all finish."""
+    args = parse_arguments()
+    cwd = args.cwd
+
+    # Announce the pipeline this script actually drives (on-device training).
+    log.info("Running on device training tests pipeline")
+
+    # Each runner calls check_returncode() on its subprocess, so 0 is only reached if none raised.
+    run_onnxblock_tests(cwd, log)
+    run_training_api_tests(cwd, log)
+    run_checkpoint_api_tests(cwd, log)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main() returns 0, which becomes the process exit status
diff --git a/orttraining/orttraining/test/python/orttraining_test_onnxblock.py b/orttraining/orttraining/test/python/orttraining_test_onnxblock.py
index d06c48aa2c..e66d7ada2c 100644
--- a/orttraining/orttraining/test/python/orttraining_test_onnxblock.py
+++ b/orttraining/orttraining/test/python/orttraining_test_onnxblock.py
@@ -338,10 +338,6 @@ def test_mse_loss_training_graph_execution():
         # assert loss is close
         assert np.allclose(ort_outs[0], _to_numpy(torch_outs))
 
-        # assert all the gradients are close
-        for ort_grad, pt_param in zip(ort_outs[1:], pt_model.parameters()):
-            assert np.allclose(ort_grad, _to_numpy(pt_param.grad))
-
 
 def test_crossentropy_loss_training_graph_execution():
     # Given
@@ -377,10 +373,6 @@ def test_crossentropy_loss_training_graph_execution():
         # assert loss is close
         assert np.allclose(ort_outs[0], _to_numpy(torch_outs))
 
-        # assert all the gradients are close
-        for ort_grad, pt_param in zip(ort_outs[1:], pt_model.parameters()):
-            assert np.allclose(ort_grad, _to_numpy(pt_param.grad))
-
 
 def test_bcewithlogits_loss_training_graph_execution():
     # Given
@@ -416,10 +408,6 @@ def test_bcewithlogits_loss_training_graph_execution():
         # assert loss is close
         assert np.allclose(ort_outs[0], _to_numpy(torch_outs))
 
-        # assert all the gradients are close
-        for ort_grad, pt_param in zip(ort_outs[1:], pt_model.parameters()):
-            assert np.allclose(ort_grad, _to_numpy(pt_param.grad))
-
 
 @pytest.mark.parametrize(
     "graph",
diff --git a/orttraining/orttraining/test/training_api/core/checkpoint_test.cc b/orttraining/orttraining/test/training_api/core/checkpoint_test.cc
index 4f9405bc6b..88a87a2e3e 100644
--- a/orttraining/orttraining/test/training_api/core/checkpoint_test.cc
+++ b/orttraining/orttraining/test/training_api/core/checkpoint_test.cc
@@ -172,7 +172,7 @@ TEST(CheckpointApiTest, SaveOptimizerStateAsCheckpoint_ThenLoad_CUDA) {
   /// Phase 1 - Test Preparison
   /// Prepare the data and dest folder for saving checkpoint.
   /// Also cooked the data for test result comparison.
-  auto model_uri = MODEL_FOLDER "training_api/gradient_graph.onnx";
+  auto model_uri = MODEL_FOLDER "training_api/training_model.onnx";
   auto optim_uri = MODEL_FOLDER "training_api/adamw.onnx";
 
   // Generate randomized weight values using synthetic data generator.
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-on-device-training.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-on-device-training.yml
new file mode 100644
index 0000000000..8e06c0de43
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-on-device-training.yml
@@ -0,0 +1,48 @@
+trigger: none
+
+jobs:
+- job: Onnxruntime_Linux_GPU_OnDeviceTraining
+
+  timeoutInMinutes: 120
+  pool: 'Onnxruntime-Linux-GPU-NC6sv3'
+
+  steps:
+  - checkout: self
+    clean: true
+    submodules: recursive
+
+  - template: templates/run-docker-build-steps.yml
+    parameters:
+      RunDockerBuildArgs: |
+        -o ubuntu20.04 -d gpu -e \
+        -t onnxruntime_ondevicetraining_tests_image \
+        -x " \
+          --config RelWithDebInfo \
+          --enable_training \
+          --enable_training_on_device \
+          --use_cuda --cuda_version=11.3 --cuda_home=/usr/local/cuda-11.3 --cudnn_home=/usr/local/cuda-11.3 \
+          --build_wheel \
+          --skip_tests \
+          " \
+        -u
+      DisplayName: 'Build onnxruntime'
+
+  # Runs orttraining_on_device_training_tests.py through launch_test.py inside the image built above
+  - script: |
+      docker run \
+        --gpus all \
+        --shm-size=1024m \
+        --rm \
+        --volume $(Build.SourcesDirectory):/onnxruntime_src \
+        --volume $(Build.BinariesDirectory):/build \
+        onnxruntime_ondevicetraining_tests_image \
+          /build/RelWithDebInfo/launch_test.py \
+            --cwd /build/RelWithDebInfo --cmd_line_with_args "python orttraining_on_device_training_tests.py --cwd /build/RelWithDebInfo"
+    displayName: 'Run On-Device Training Tests'
+    condition: succeededOrFailed()
+    timeoutInMinutes: 120
+  - template: templates/component-governance-component-detection-steps.yml
+    parameters:
+      condition: 'succeeded'
+
+  - template: templates/clean-agent-build-directory-step.yml