diff --git a/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py b/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py
index 23b17a078c..5a88bc497e 100644
--- a/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py
+++ b/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py
@@ -70,11 +70,13 @@ def bert_model_description(config):
             ('attention_mask', ['batch', 'max_seq_len_in_batch'],),
             ('token_type_ids', ['batch', 'max_seq_len_in_batch'],),
             ('masked_lm_labels', ['batch', 'max_seq_len_in_batch'],),
-            ('next_sentence_label', ['batch', ],)],
+            ('next_sentence_label', ['batch', ],)
+            ],
         'outputs': [
             ('loss', [], True),
             ('prediction_scores', ['batch', 'max_seq_len_in_batch', vocab_size],),
-            ('seq_relationship_scores', ['batch', 2],)]}
+            ('seq_relationship_scores', ['batch', 2],)
+            ]}
     return new_model_desc
 
 
@@ -119,6 +121,47 @@ class pretraining_dataset(Dataset):
         return [input_ids, segment_ids, input_mask,
                 masked_lm_labels, next_sentence_labels]
 
+import argparse
+def parse_arguments(argv=None):
+    """Parse the batch size test options; `argv` defaults to sys.argv[1:] when None."""
+    parser = argparse.ArgumentParser()
+
+    # batch size test config parameters
+    parser.add_argument("--enable_mixed_precision",
+                        default=False,
+                        action='store_true',
+                        help="Whether to use 16-bit float precision instead of 32-bit")
+
+    parser.add_argument("--sequence_length",
+                        default=512,
+                        type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, and sequences shorter \n"
+                             "than this will be padded.")
+    parser.add_argument("--max_predictions_per_seq",
+                        default=80,
+                        type=int,
+                        help="The maximum total of masked tokens in input sequence")
+    parser.add_argument("--max_batch_size",
+                        default=32,
+                        type=int,
+                        help="Total batch size for training.")
+
+    # recompute switches; help text mirrors the PretrainArguments fields of the same name
+    parser.add_argument("--gelu_recompute",
+                        default=False,
+                        action='store_true',
+                        help="Whether to enable recomputing Gelu activation output to save memory.")
+    parser.add_argument("--attn_dropout_recompute",
+                        default=False,
+                        action='store_true',
+                        help="Whether to enable recomputing attention dropout to save memory.")
+    parser.add_argument("--transformer_layer_recompute",
+                        default=False,
+                        action='store_true',
+                        help="Whether to enable recomputing transformer layerwise to save memory.")
+    return parser.parse_args(argv)
+
 @dataclass
 class PretrainArguments:
     """
@@ -207,6 +250,19 @@ class PretrainArguments:
         metadata={"help": "Whether to use 16-bit float precision instead of 32-bit."}
     )
 
+    gelu_recompute: bool = field(
+        default=False,
+        metadata={"help": "Whether to enable recomputing Gelu activation output to save memory."}
+    )
+    attn_dropout_recompute: bool = field(
+        default=False,
+        metadata={"help": "Whether to enable recomputing attention dropout to save memory."}
+    )
+    transformer_layer_recompute: bool = field(
+        default=False,
+        metadata={"help": "Whether to enable recomputing transformer layerwise to save memory."}
+    )
+
     loss_scale: Optional[float] = field(
         default=0.0,
         metadata={"help": "Loss scaling, positive power of 2 values can improve fp16 convergence."}
@@ -345,8 +401,8 @@ def setup_torch_distributed(world_rank, world_size):
     return
 
 def prepare_model(args, device):
-    config = BertConfig.from_pretrained('bert-base-uncased', cache_dir=args.cache_dir)
-    
+    config = BertConfig.from_pretrained(args.bert_model, cache_dir=args.cache_dir)
+
     # config.num_hidden_layers = 12
     if args.force_num_hidden_layers:
         logger.info("Modifying model config with num_hidden_layers to %d", args.force_num_hidden_layers)
@@ -367,6 +423,11 @@ def prepare_model(args, device):
                                             'mixed_precision': {
                                                 'enabled': args.fp16,
                                                 'loss_scaler': loss_scaler},
+                                            'graph_transformer': {
+                                                'attn_dropout_recompute': args.attn_dropout_recompute,
+                                                'gelu_recompute': args.gelu_recompute,
+                                                'transformer_layer_recompute': args.transformer_layer_recompute,
+                                            },
                                             'debug': {'deterministic_compute': True, },
                                             'utils': {
                                                 'grad_norm_clip': True},
@@ -524,41 +585,41 @@ class ORTBertPretrainTest(unittest.TestCase):
         self.allreduce_post_accumulation = True
         self.tensorboard_dir = '/bert_data/hf_data/test_out'
 
-    def test_pretrain_throughput(self):
-        # setting train_batch_size and gradient_accumulation_steps to maximize per gpu memory usage under 16GB
-        # to validate throughput regression.
-        # train_batch_size is initially configured as per optimization batch size,
-        # taking into consideration of world_size and gradient_accumulation_steps:
-        # train_batch_size = world_size * gradient_accumulation_steps * batch_size_per_gpu
-        # in the code later train_batch_size is translated to per gpu batch size:
-        # args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps // args.world_size
+    def test_pretrain_throughput(self, process_args=None):
+        """Run a short pretraining pass (max_steps=10) configured by the parsed command-line options."""
+        # Options come from the command line; a plain unittest run has none, so skip instead of crashing.
+        if process_args is None:
+            self.skipTest("test_pretrain_throughput requires the batch size test command-line arguments")
+        if process_args.sequence_length == 128:
+            input_dir = '/bert_data/hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/books_wiki_en_corpus/train'
+        else:
+            input_dir = '/bert_data/hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/books_wiki_en_corpus/train'
 
-        # the LAMB batch size of 64k
-        optimization_batch_size = 64 * 1024
-        per_gpu_batch_size = 32
+        for option in ("enable_mixed_precision", "sequence_length", "max_batch_size", "max_predictions_per_seq",
+                       "gelu_recompute", "attn_dropout_recompute", "transformer_layer_recompute"):
+            print("process_args.{}: ".format(option), getattr(process_args, option))
 
-        self.train_batch_size = optimization_batch_size
-        self.gradient_accumulation_steps = optimization_batch_size // per_gpu_batch_size // self.world_size
-
-        logger.info("self.gradient_accumulation_steps = %d", self.gradient_accumulation_steps)
-
-        # only to run on  optimization step because we only want to make sure there is no throughput regression
-        self.max_steps = 1
         args = PretrainArguments(
-            output_dir=self.output_dir,
-            bert_model=self.bert_model,
+            input_dir=input_dir,
+            output_dir='/bert_data/hf_data/test_out/bert_pretrain_results',
+            bert_model='bert-large-uncased',
             local_rank=self.local_rank,
             world_rank=self.world_rank,
             world_size=self.world_size,
-            max_steps=self.max_steps,
-            learning_rate=self.learning_rate,
-            max_seq_length=self.max_seq_length,
-            max_predictions_per_seq=self.max_predictions_per_seq,
-            train_batch_size=self.train_batch_size,
-            gradient_accumulation_steps=self.gradient_accumulation_steps,
-            input_dir=self.input_dir,
-            fp16=self.fp16,
-            allreduce_post_accumulation=self.allreduce_post_accumulation)
+            max_steps=10,
+            learning_rate=5e-4,
+            max_seq_length=process_args.sequence_length,
+            max_predictions_per_seq=process_args.max_predictions_per_seq,
+            train_batch_size=process_args.max_batch_size,
+            gradient_accumulation_steps=1,
+            fp16=process_args.enable_mixed_precision,
+            gelu_recompute=process_args.gelu_recompute,
+            attn_dropout_recompute=process_args.attn_dropout_recompute,
+            transformer_layer_recompute=process_args.transformer_layer_recompute,
+            allreduce_post_accumulation=True,
+            # TODO: remove
+            force_num_hidden_layers=2,
+        )
         do_pretrain(args)
 
     def test_pretrain_convergence(self):
@@ -621,8 +682,8 @@ class ORTBertPretrainTest(unittest.TestCase):
             fp16=self.fp16,
             allreduce_post_accumulation=self.allreduce_post_accumulation,
             force_num_hidden_layers=self.force_num_hidden_layers,
-            deepspeed_zero_stage = self.deepspeed_zero_stage,
-            save_checkpoint = True)
+            deepspeed_zero_stage=self.deepspeed_zero_stage,
+            save_checkpoint=True)
         train_loss = do_pretrain(args)
 
         # ensure all workers reach this point before loading the checkpointed state
@@ -633,7 +694,7 @@ class ORTBertPretrainTest(unittest.TestCase):
             checkpoint_files = _list_checkpoint_files(self.output_dir, "ORT_checkpoint")
             ckpt_agg = _CombineZeroCheckpoint(checkpoint_files)
             final_state_dict = ckpt_agg.aggregate_checkpoints()
-            
+
             args.init_state_dict = final_state_dict
 
         torch.distributed.barrier()
@@ -646,22 +707,31 @@ class ORTBertPretrainTest(unittest.TestCase):
         return final_loss
 
 
-# to do parallel training:
-# python -m torch.distributed.launch --nproc_per_node 4 orttraining_run_bert_pretrain.py
 if __name__ == "__main__":
     import sys
     logger.warning("sys.argv: %s", sys.argv)
     # usage:
-    #   mpirun -n 4 python orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_throughput
-    #   mpirun -n 4 python orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence
-    #   mpirun -n 4 python orttraining_run_bert_pretrain.py     # to run real BERT convergence test
-    # pytorch.distributed.launch will not work because ort backend requires MPI to broadcast ncclUniqueId
+    # data parallel training:
+    #   mpirun -n 4 python orttraining_run_bert_pretrain.py
     #
+    # single gpu:
+    #   python orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_throughput
+    #     [batch size test arguments]
+    #   python orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence
+    #
+    # torch.distributed.launch will not work because the ORT backend requires MPI to broadcast ncclUniqueId
     # calling unpublished get_mpi_context_xxx to get rank/size numbers.
-    from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_local_size, get_mpi_context_world_rank, get_mpi_context_world_size
-    world_size = get_mpi_context_world_size()
-    if world_size > 1:
-        print ('get_mpi_context_world_size(): ', world_size)
+    try:
+        # In case ORT is not built with MPI/NCCL, there are no get_mpi_context_xxx internal apis.
+        from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_local_size,\
+            get_mpi_context_world_rank, get_mpi_context_world_size
+        has_get_mpi_context_internal_api = True
+    except ImportError:
+        has_get_mpi_context_internal_api = False
+        pass
+    if has_get_mpi_context_internal_api and get_mpi_context_world_size() > 1:
+        world_size = get_mpi_context_world_size()
+        print('get_mpi_context_world_size(): ', world_size)
         local_rank = get_mpi_context_local_rank()
 
         if local_rank == 0:
@@ -673,19 +743,7 @@ if __name__ == "__main__":
         test.world_rank = local_rank
         test.world_size = world_size
 
-        if len(sys.argv) >= 2 and sys.argv[1] == 'ORTBertPretrainTest.test_pretrain_throughput':
-            logger.info("running ORTBertPretrainTest.test_pretrain_throughput()...")
-            test.test_pretrain_throughput()
-            logger.info("ORTBertPretrainTest.test_pretrain_throughput() passed")
-        elif len(sys.argv) >= 2 and sys.argv[1] == 'ORTBertPretrainTest.test_pretrain_convergence':
-            logger.info("running ORTBertPretrainTest.test_pretrain_convergence()...")
-            test.max_steps = 200
-            test.force_num_hidden_layers = 8
-            final_loss = test.test_pretrain_convergence()
-            logger.info("ORTBertPretrainTest.test_pretrain_convergence() final loss = %f", final_loss)
-            test.assertLess(final_loss, 8.5)
-            logger.info("ORTBertPretrainTest.test_pretrain_convergence() passed")
-        elif len(sys.argv) >= 2 and sys.argv[1] == 'ORTBertPretrainTest.test_pretrain_zero':
+        if len(sys.argv) >= 2 and sys.argv[1] == 'ORTBertPretrainTest.test_pretrain_zero':
             logger.info("running ORTBertPretrainTest.test_pretrain_zero()...")
             final_loss = test.test_pretrain_zero()
             logger.info("ORTBertPretrainTest.test_pretrain_zero() rank = %i final loss = %f", local_rank, final_loss)
@@ -694,37 +752,23 @@ if __name__ == "__main__":
             else:
                 test.assertGreater(final_loss, 11.0)
             logger.info("ORTBertPretrainTest.test_pretrain_zero() passed")
+        elif len(sys.argv) >= 2 and sys.argv[1] == 'ORTBertPretrainTest.test_pretrain_convergence':
+            logger.info("running ORTBertPretrainTest.test_pretrain_convergence()...")
+            test.max_steps = 200
+            test.force_num_hidden_layers = 8
+            final_loss = test.test_pretrain_convergence()
+            logger.info("ORTBertPretrainTest.test_pretrain_convergence() final loss = %f", final_loss)
+            test.assertLess(final_loss, 8.5)
+            logger.info("ORTBertPretrainTest.test_pretrain_convergence() passed")
         else:
             # https://microsoft.sharepoint.com/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc={170774be-e1c6-4f8b-a3ae-984f211fe410}&action=edit&wd=target%28ONNX%20Training.one%7C8176133b-c7cb-4ef2-aa9d-3fdad5344c40%2FGitHub%20Master%20Merge%20Schedule%7Cb67f0db1-e3a0-4add-80a6-621d67fd8107%2F%29
             # to make equivalent args for cpp convergence test
-
-            # ngpu=4 
-            # seq_len=128 
-            # max_predictions_per_seq=20 
-            # batch_size=64 
-            # grad_acc=16 
-            # num_train_steps=1000000 
-            # optimizer=adam
-            # lr=5e-4 
-            # warmup_ratio=0.1 
-            # warmup_mode=Linear 
-            # effective_batch_size=$((ngpu * batch_size * grad_acc)) 
-            # commit=$(git rev-parse HEAD | cut -c1-8) 
-            # time_now=$(date +%m%d%H%M) 
-            # run_name=ort_${commit}_nvbertbase_bookwiki128_fp16_${optimizer}_lr${lr}_${warmup_mode}${warmup_ratio}_g${ngpu}_bs${batch_size}_acc${grad_acc}_efbs${effective_batch_size}_steps${num_train_steps}_fp16allreduce_${time_now} 
-
-            # mixed precision 
-            # mpirun -n ${ngpu} ./onnxruntime_training_bert --model_name /bert_ort/bert_models/nv/bert-base/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm
-            #   --train_data_dir /bert_data/128/books_wiki_en_corpus/train --test_data_dir /bert_data/128/books_wiki_en_corpus/test
-            #   --train_batch_size ${batch_size} --mode train --num_train_steps ${num_train_steps} --display_loss_steps 5 
-            #   --log_dir ./logs/bert_base/${run_name} --optimizer ${optimizer} --learning_rate ${lr} --warmup_ratio ${warmup_ratio} --warmup_mode ${warmup_mode} 
-            #   --gradient_accumulation_steps ${grad_acc} --max_predictions_per_seq=${max_predictions_per_seq} --use_mixed_precision --allreduce_in_fp16 --lambda 0
-            #   --use_nccl | tee ${run_name}.log 
-
             test.max_seq_length = 128
             test.max_predictions_per_seq = 20
             test.gradient_accumulation_steps = 16
-            test.train_batch_size = 64 * test.gradient_accumulation_steps * test.world_size    # cpp_batch_size (=64) * grad_acc * world_size
+
+            # cpp_batch_size (=64) * grad_acc * world_size
+            test.train_batch_size = 64 * test.gradient_accumulation_steps * test.world_size
             test.max_steps = 300000
 
             test.force_num_hidden_layers = None
@@ -736,4 +780,30 @@ if __name__ == "__main__":
             final_loss = test.test_pretrain_convergence()
             logger.info("ORTBertPretrainTest.test_pretrain_convergence() final loss = %f", final_loss)
     else:
-        unittest.main()
+        # unittest does not accept user-defined arguments, so strip the optional test selector
+        # from sys.argv and let argparse handle the remaining options.
+        throughput_test = 'ORTBertPretrainTest.test_pretrain_throughput'
+        convergence_test = 'ORTBertPretrainTest.test_pretrain_convergence'
+        selected = sys.argv[1] if len(sys.argv) >= 2 else None
+        if selected in (throughput_test, convergence_test):
+            del sys.argv[1]
+        else:
+            selected = None  # no selector given: run both tests
+        process_args = parse_arguments()
+        test = ORTBertPretrainTest()
+        test.setUp()
+
+        if selected in (None, throughput_test):
+            logger.info("running single GPU ORTBertPretrainTest.test_pretrain_throughput()...")
+            test.test_pretrain_throughput(process_args)
+            logger.info("single GPU ORTBertPretrainTest.test_pretrain_throughput() passed")
+
+        if selected in (None, convergence_test):
+            # same settings and loss threshold as the MPI convergence branch above
+            logger.info("running single GPU ORTBertPretrainTest.test_pretrain_convergence()...")
+            test.max_steps = 200
+            test.force_num_hidden_layers = 8
+            final_loss = test.test_pretrain_convergence()
+            logger.info("single GPU ORTBertPretrainTest.test_pretrain_convergence() final loss = %f", final_loss)
+            test.assertLess(final_loss, 8.5)
+            logger.info("single GPU ORTBertPretrainTest.test_pretrain_convergence() passed")
diff --git a/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py b/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py
new file mode 100644
index 0000000000..3c8dfea216
--- /dev/null
+++ b/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py
@@ -0,0 +1,56 @@
+import sys
+import collections
+import subprocess
+
+# One batch size test case. The field order below is the positional order
+# used by every row in `configs`.
+Config = collections.namedtuple(
+    "Config",
+    "enable_mixed_precision sequence_length "
+    "max_batch_size max_predictions_per_seq "
+    "gelu_recompute attn_dropout_recompute transformer_layer_recompute")
+
+configs = [
+    # no recompute option enabled
+    Config(True, 128, 46, 20, False, False, False),
+    Config(True, 512, 8, 80, False, False, False),
+    Config(False, 128, 26, 20, False, False, False),
+    Config(False, 512, 4, 80, False, False, False),
+    # exactly one recompute option enabled, sequence length 128
+    Config(True, 128, 50, 20, True, False, False),
+    Config(True, 128, 50, 20, False, True, False),
+    Config(True, 128, 76, 20, False, False, True),
+    # exactly one recompute option enabled, sequence length 512
+    Config(True, 512, 8, 80, True, False, False),
+    Config(True, 512, 9, 80, False, True, False),
+    Config(True, 512, 15, 80, False, False, True),
+]
+
+def run_with_config(config):
+    """Run the pretrain throughput test in a child process with the settings in `config`.
+
+    Raises subprocess.CalledProcessError on non-zero exit and subprocess.TimeoutExpired after 1200 seconds.
+    """
+    print("##### testing name - {}-{} #####".format("fp16" if config.enable_mixed_precision else "fp32",
+                                                        config.sequence_length))
+    print("gelu_recompute: ", config.gelu_recompute)
+    print("attn_dropout_recompute: ", config.attn_dropout_recompute)
+    print("transformer_layer_recompute: ", config.transformer_layer_recompute)
+
+    cmds = [
+        sys.executable,
+        'orttraining_run_bert_pretrain.py',
+        "ORTBertPretrainTest.test_pretrain_throughput",
+        "--sequence_length", str(config.sequence_length),
+        "--max_batch_size", str(config.max_batch_size),
+        "--max_predictions_per_seq", str(config.max_predictions_per_seq)]
+    # boolean options are bare switches: pass the flag only when it is enabled
+    for switch in ("enable_mixed_precision", "gelu_recompute", "attn_dropout_recompute", "transformer_layer_recompute"):
+        if getattr(config, switch):
+            cmds.append("--" + switch)
+    subprocess.run(cmds, timeout=1200, check=True)
+
+if __name__ == "__main__":
+    # run every configuration in order; the first failing child process aborts the sweep
+    for config in configs:
+        run_with_config(config)
diff --git a/orttraining/orttraining/test/python/orttraining_run_glue.py b/orttraining/orttraining/test/python/orttraining_run_glue.py
index cf7c3fcbfd..b96a55756a 100644
--- a/orttraining/orttraining/test/python/orttraining_run_glue.py
+++ b/orttraining/orttraining/test/python/orttraining_run_glue.py
@@ -124,7 +124,7 @@ class ORTGlueTest(unittest.TestCase):
     def test_bert_fp16_with_mrpc(self):
         expected_acc = 0.84
         expected_f1 = 0.88
-        expected_loss = 0.40
+        expected_loss = 0.44
 
         results = self.run_glue(model_name="bert-base-cased", task_name="MRPC", fp16=True)
 
diff --git a/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py b/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py
index c1e8ebca95..256f4a59ed 100644
--- a/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py
+++ b/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py
@@ -90,6 +90,7 @@ class ORTMultipleChoiceTest(unittest.TestCase):
         self.output_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "multiple_choice_test_output/")
         self.cache_dir = '/tmp/multiple_choice/'
         self.logging_steps = 10
+        self.rtol = 2e-01
 
     def test_bert_with_swag(self):
         expected_acc = 0.75
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index d02cf6f56c..26d6e3468a 100755
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1180,16 +1180,15 @@ def run_training_python_frontend_e2e_tests(cwd):
     # frontend tests are to be added here:
     log.info("Running python frontend e2e tests.")
 
+    run_subprocess(
+        [sys.executable, 'orttraining_run_frontend_batch_size_test.py', '-v'],
+        cwd=cwd, env={'CUDA_VISIBLE_DEVICES': '0'})
+
     import torch
     ngpus = torch.cuda.device_count()
     if ngpus > 1:
         bert_pretrain_script = 'orttraining_run_bert_pretrain.py'
-        log.debug('RUN: mpirun -n {} ''-x' 'NCCL_DEBUG=INFO'' {} {} {}'.format(
-            ngpus, sys.executable, bert_pretrain_script, 'ORTBertPretrainTest.test_pretrain_throughput'))
-        run_subprocess([
-            'mpirun', '-n', str(ngpus), '-x', 'NCCL_DEBUG=INFO', sys.executable,
-            bert_pretrain_script, 'ORTBertPretrainTest.test_pretrain_throughput'], cwd=cwd)
-
+        # TODO: this test will be replaced with convergence test ported from backend
         log.debug('RUN: mpirun -n {} ''-x' 'NCCL_DEBUG=INFO'' {} {} {}'.format(
             ngpus, sys.executable, bert_pretrain_script, 'ORTBertPretrainTest.test_pretrain_convergence'))
         run_subprocess([
@@ -1231,7 +1230,8 @@ def run_training_python_frontend_e2e_tests(cwd):
         sys.executable, 'orttraining_test_transformers.py',
         'BertModelTest.test_for_pretraining_mixed_precision'], cwd=cwd)
 
-    # this test is not stable. need to skip to unblock release
+    # This test is not stable: it occasionally segfaults due to its session creation/release pattern.
+    # Skip it for now to unblock the release.
     # run_subprocess([
     #     sys.executable, 'orttraining_test_transformers.py',
     #     'BertModelTest.test_for_pretraining_mixed_precision_with_gradient_accumulation'], cwd=cwd)