From 1ddfe1249bd7e26940ff8a5319cc3722dc53e78e Mon Sep 17 00:00:00 2001
From: liqunfu <liqfu@microsoft.com>
Date: Wed, 8 Apr 2020 10:03:07 -0700
Subject: [PATCH] frontend test to use random seed (#3209)

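Make the frontend ORTTrainer tests deterministic by using a fixed random
seed.

- Add an optional `seed` argument to ORTTrainer and
  create_ort_training_session_with_optimizer; when it is not None it is
  copied to TrainingParameters.seed.
- Expose TrainingParameters::seed to Python through pybind.
- Pass seed=1 in the BERT training tests, refresh the expected losses,
  and drop the explicit rtol=1e-01 so assert_allclose uses its default
  tolerance.
- Enable the loss print statements in the two BERT training tests.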
---
 .../python/onnxruntime_test_ort_trainer.py    | 40 +++++++++----------
 orttraining/orttraining/python/ort_trainer.py | 13 +++++-
 .../python/orttraining_pybind_state.cc        |  3 +-
 3 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py
index d50c1aca85..7c993929b9 100644
--- a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py
+++ b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py
@@ -38,7 +38,6 @@ def bert_model_description():
                                           num_classes=vocab_size)
     next_sentence_labels_desc = IODescription('next_sentence_labels', ['batch', ], torch.int64, num_classes=2)
     loss_desc = IODescription('loss', [], torch.float32)
-    # probability_desc = IODescription('probability', ['batch', 10], torch.float32)
 
     return ModelDescription([input_ids_desc, segment_ids_desc, input_mask_desc, masked_lm_labels_desc,
                              next_sentence_labels_desc], [loss_desc])
@@ -72,7 +71,8 @@ def runBertTrainingTest(gradient_accumulation_steps, use_mixed_precision, allred
                        gradient_accumulation_steps=gradient_accumulation_steps,
                        world_rank=0, world_size=1,
                        use_mixed_precision=use_mixed_precision,
-                       allreduce_post_accumulation=allreduce_post_accumulation)
+                       allreduce_post_accumulation=allreduce_post_accumulation,
+                       seed=1)
 
     loss_scaler = LossScaler(model.loss_scale_input_name, True)
 
@@ -134,44 +134,42 @@ class TestOrtTrainer(unittest.TestCase):
     def testBertTrainingBasic(self):
         torch.manual_seed(1)
         expected_losses = [
-            11.050175666809082, 11.16925048828125, 11.017821311950684, 11.052311897277832,
-            10.89547061920166, 10.996326446533203, 11.079578399658203, 10.966521263122559]
-        expected_eval_loss = [11.05634880065918]
+            11.032349586486816, 11.165414810180664, 11.018413543701172, 11.050261497497559,
+            10.855697631835938, 10.947554588317871, 11.083847999572754, 10.97836685180664]
+        expected_eval_loss = [10.972074508666992]
         actual_losses, actual_eval_loss = runBertTrainingTest(
             gradient_accumulation_steps=1, use_mixed_precision=False, allreduce_post_accumulation=False)
 
         # to update expected outcomes, enable pdb and run the test with -s and copy paste outputs
-        # print('actual_losses ', actual_losses)
-        # print('eval_loss', actual_eval_loss)
+        print('actual_losses ', actual_losses)
+        print('eval_loss', actual_eval_loss)
         # import pdb; pdb.set_trace()
 
-        rtol = 1e-01
-        assert_allclose(expected_losses, actual_losses, rtol=rtol, err_msg="loss mismatch")
-        assert_allclose(expected_eval_loss, actual_eval_loss, rtol=rtol, err_msg="evaluation loss mismatch")
+        assert_allclose(expected_losses, actual_losses, err_msg="loss mismatch")
+        assert_allclose(expected_eval_loss, actual_eval_loss, err_msg="evaluation loss mismatch")
 
     def testBertTrainingGradientAccumulation(self):
         torch.manual_seed(1)
         # this commented expected results are for runing test individually (pytest with -k). 
         # expected_losses = [
-        #     11.050175666809082, 11.16925048828125, 11.017815589904785, 11.0523099899292, 
-        #     10.895469665527344, 10.996331214904785, 11.079588890075684, 10.966512680053711]
-        # expected_eval_loss = [11.05636978149414]
+        #     11.071269035339355, 10.996841430664062, 11.06226921081543, 10.981647491455078,
+        #     11.032355308532715, 11.04256534576416, 10.976116180419922, 11.065701484680176]
+        # expected_eval_loss = [10.991236686706543]
         expected_losses = [
-            11.041119575500488, 11.142148971557617, 11.022183418273926, 11.047553062438965,
-            10.866510391235352, 10.95550537109375, 11.083690643310547, 11.002318382263184]
-        expected_eval_loss = [10.977485656738281]
+            11.026690483093262, 11.117761611938477, 11.010371208190918, 11.068782806396484,
+            10.894888877868652, 10.923206329345703, 11.06037425994873, 11.008777618408203]
+        expected_eval_loss = [11.011880874633789]
         
         actual_losses, actual_eval_loss = runBertTrainingTest(
             gradient_accumulation_steps=4, use_mixed_precision=False, allreduce_post_accumulation=False)
 
         # to update expected outcomes, enable pdb and run the test with -s and copy paste outputs
-        # print('actual_losses ', actual_losses)
-        # print('eval_loss', actual_eval_loss)
+        print('actual_losses ', actual_losses)
+        print('eval_loss', actual_eval_loss)
         # import pdb; pdb.set_trace()
 
-        rtol = 1e-01
-        assert_allclose(expected_losses, actual_losses, rtol=rtol, err_msg="loss mismatch")
-        assert_allclose(expected_eval_loss, actual_eval_loss, rtol=rtol, err_msg="evaluation loss mismatch")
+        assert_allclose(expected_losses, actual_losses, err_msg="loss mismatch")
+        assert_allclose(expected_eval_loss, actual_eval_loss, err_msg="evaluation loss mismatch")
 
     def testBertTrainingMixedPrecision(self):
         # skip the test due to the lack of mixed precision capacity of ort CI.
diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py
index df01f86b2a..46df724b6a 100644
--- a/orttraining/orttraining/python/ort_trainer.py
+++ b/orttraining/orttraining/python/ort_trainer.py
@@ -384,7 +384,9 @@ def create_ort_training_session_with_optimizer(model, device, training_optimizer
                                                map_optimizer_attributes, world_rank=-1, world_size=1,
                                                gradient_accumulation_steps=1, bind_parameters=False,
                                                use_mixed_precision=False, allreduce_post_accumulation=False,
-                                               partition_optimizer=False, enable_grad_norm_clip=True,
+                                               partition_optimizer=False,
+                                               enable_grad_norm_clip=True,
+                                               seed=None,
                                                frozen_weights=[]):
     output_name = model.graph.output[0].name
     ort_parameters = ort.TrainingParameters()
@@ -396,6 +398,8 @@ def create_ort_training_session_with_optimizer(model, device, training_optimizer
     ort_parameters.use_mixed_precision = use_mixed_precision
     ort_parameters.allreduce_post_accumulation = allreduce_post_accumulation
     ort_parameters.partition_optimizer = partition_optimizer
+    if seed is not None:
+        ort_parameters.seed = seed
     ort_parameters.enable_grad_norm_clip = enable_grad_norm_clip
 
     output_types = {}
@@ -516,6 +520,7 @@ class ORTTrainer():
                  learning_rate_description, device, gradient_accumulation_steps=1, postprocess_model=None,
                  world_rank=0, world_size=1, use_mixed_precision=False, allreduce_post_accumulation=False,
                  global_step=0, get_lr_this_step=None, loss_scaler=None, partition_optimizer=False,
+                 seed=None,
                  enable_grad_norm_clip=True, frozen_weights=[]):
         super(ORTTrainer, self).__init__()
         """
@@ -546,6 +551,7 @@ class ORTTrainer():
             use_mixed_precision:
             allreduce_post_accumulation:
             partition_optimizer: Whether to partition the optimizer state. (default=False)
+            seed: Optional random seed forwarded to the backend training parameters; not set when None. (default=None)
         """
         self.is_train = True
 
@@ -593,6 +599,7 @@ class ORTTrainer():
         self.enable_grad_norm_clip_ = enable_grad_norm_clip
         self.frozen_weights_ = frozen_weights
         self.loss_scale_input_name = ''
+        self.seed_ = seed
 
         self._init_session()
 
@@ -608,7 +615,9 @@ class ORTTrainer():
                 self.world_rank, self.world_size,
                 self.gradient_accumulation_steps, bind_parameters=False,
                 use_mixed_precision=self.use_mixed_precision, allreduce_post_accumulation=self.allreduce_post_accumulation_,
-                partition_optimizer=self.partition_optimizer_, enable_grad_norm_clip=self.enable_grad_norm_clip_,
+                partition_optimizer=self.partition_optimizer_, 
+                enable_grad_norm_clip=self.enable_grad_norm_clip_,
+                seed=self.seed_,
                 frozen_weights=self.frozen_weights_)
 
         self.loss_scale_input_name = self.session.loss_scale_input_name
diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc b/orttraining/orttraining/python/orttraining_pybind_state.cc
index 341a53bf70..072dc53337 100644
--- a/orttraining/orttraining/python/orttraining_pybind_state.cc
+++ b/orttraining/orttraining/python/orttraining_pybind_state.cc
@@ -187,7 +187,8 @@ void addObjectMethodsForTraining(py::module& m) {
       .def_readwrite("world_size", &TrainingParameters::world_size)
       .def_readwrite("gradient_accumulation_steps", &TrainingParameters::gradient_accumulation_steps)
       .def_readwrite("partition_optimizer", &TrainingParameters::partition_optimizer)
-      .def_readwrite("enable_grad_norm_clip", &TrainingParameters::enable_grad_norm_clip);
+      .def_readwrite("enable_grad_norm_clip", &TrainingParameters::enable_grad_norm_clip)
+      .def_readwrite("seed", &TrainingParameters::seed);
 
   py::class_<TrainingConfigurationResult> config_result(m, "TrainingConfigurationResult", "pbdoc(Configuration result for training.)pbdoc");
   config_result.def(py::init())