From 1ddfe1249bd7e26940ff8a5319cc3722dc53e78e Mon Sep 17 00:00:00 2001 From: liqunfu Date: Wed, 8 Apr 2020 10:03:07 -0700 Subject: [PATCH] frontend test to use random seed (#3209) frontend test to use random seed --- .../python/onnxruntime_test_ort_trainer.py | 40 +++++++++---------- orttraining/orttraining/python/ort_trainer.py | 13 +++++- .../python/orttraining_pybind_state.cc | 3 +- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py index d50c1aca85..7c993929b9 100644 --- a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py +++ b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py @@ -38,7 +38,6 @@ def bert_model_description(): num_classes=vocab_size) next_sentence_labels_desc = IODescription('next_sentence_labels', ['batch', ], torch.int64, num_classes=2) loss_desc = IODescription('loss', [], torch.float32) - # probability_desc = IODescription('probability', ['batch', 10], torch.float32) return ModelDescription([input_ids_desc, segment_ids_desc, input_mask_desc, masked_lm_labels_desc, next_sentence_labels_desc], [loss_desc]) @@ -72,7 +71,8 @@ def runBertTrainingTest(gradient_accumulation_steps, use_mixed_precision, allred gradient_accumulation_steps=gradient_accumulation_steps, world_rank=0, world_size=1, use_mixed_precision=use_mixed_precision, - allreduce_post_accumulation=allreduce_post_accumulation) + allreduce_post_accumulation=allreduce_post_accumulation, + seed=1) loss_scaler = LossScaler(model.loss_scale_input_name, True) @@ -134,44 +134,42 @@ class TestOrtTrainer(unittest.TestCase): def testBertTrainingBasic(self): torch.manual_seed(1) expected_losses = [ - 11.050175666809082, 11.16925048828125, 11.017821311950684, 11.052311897277832, - 10.89547061920166, 10.996326446533203, 11.079578399658203, 10.966521263122559] - expected_eval_loss = [11.05634880065918] + 11.032349586486816, 11.165414810180664, 11.018413543701172, 11.050261497497559, + 10.855697631835938, 10.947554588317871, 11.083847999572754, 10.97836685180664] + expected_eval_loss = [10.972074508666992] actual_losses, actual_eval_loss = runBertTrainingTest( gradient_accumulation_steps=1, use_mixed_precision=False, allreduce_post_accumulation=False) # to update expected outcomes, enable pdb and run the test with -s and copy paste outputs - # print('actual_losses ', actual_losses) - # print('eval_loss', actual_eval_loss) + print('actual_losses ', actual_losses) + print('eval_loss', actual_eval_loss) # import pdb; pdb.set_trace() - rtol = 1e-01 - assert_allclose(expected_losses, actual_losses, rtol=rtol, err_msg="loss mismatch") - assert_allclose(expected_eval_loss, actual_eval_loss, rtol=rtol, err_msg="evaluation loss mismatch") + assert_allclose(expected_losses, actual_losses, err_msg="loss mismatch") + assert_allclose(expected_eval_loss, actual_eval_loss, err_msg="evaluation loss mismatch") def testBertTrainingGradientAccumulation(self): torch.manual_seed(1) # this commented expected results are for runing test individually (pytest with -k). # expected_losses = [ - # 11.050175666809082, 11.16925048828125, 11.017815589904785, 11.0523099899292, - # 10.895469665527344, 10.996331214904785, 11.079588890075684, 10.966512680053711] - # expected_eval_loss = [11.05636978149414] + # 11.071269035339355, 10.996841430664062, 11.06226921081543, 10.981647491455078, + # 11.032355308532715, 11.04256534576416, 10.976116180419922, 11.065701484680176] + # expected_eval_loss = [10.991236686706543] expected_losses = [ - 11.041119575500488, 11.142148971557617, 11.022183418273926, 11.047553062438965, - 10.866510391235352, 10.95550537109375, 11.083690643310547, 11.002318382263184] - expected_eval_loss = [10.977485656738281] + 11.026690483093262, 11.117761611938477, 11.010371208190918, 11.068782806396484, + 10.894888877868652, 10.923206329345703, 11.06037425994873, 11.008777618408203] + expected_eval_loss = [11.011880874633789] actual_losses, actual_eval_loss = runBertTrainingTest( gradient_accumulation_steps=4, use_mixed_precision=False, allreduce_post_accumulation=False) # to update expected outcomes, enable pdb and run the test with -s and copy paste outputs - # print('actual_losses ', actual_losses) - # print('eval_loss', actual_eval_loss) + print('actual_losses ', actual_losses) + print('eval_loss', actual_eval_loss) # import pdb; pdb.set_trace() - rtol = 1e-01 - assert_allclose(expected_losses, actual_losses, rtol=rtol, err_msg="loss mismatch") - assert_allclose(expected_eval_loss, actual_eval_loss, rtol=rtol, err_msg="evaluation loss mismatch") + assert_allclose(expected_losses, actual_losses, err_msg="loss mismatch") + assert_allclose(expected_eval_loss, actual_eval_loss, err_msg="evaluation loss mismatch") def testBertTrainingMixedPrecision(self): # skip the test due to the lack of mixed precision capacity of ort CI. diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index df01f86b2a..46df724b6a 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -384,7 +384,9 @@ def create_ort_training_session_with_optimizer(model, device, training_optimizer map_optimizer_attributes, world_rank=-1, world_size=1, gradient_accumulation_steps=1, bind_parameters=False, use_mixed_precision=False, allreduce_post_accumulation=False, - partition_optimizer=False, enable_grad_norm_clip=True, + partition_optimizer=False, + enable_grad_norm_clip=True, + seed=None, frozen_weights=[]): output_name = model.graph.output[0].name ort_parameters = ort.TrainingParameters() @@ -396,6 +398,8 @@ def create_ort_training_session_with_optimizer(model, device, training_optimizer ort_parameters.use_mixed_precision = use_mixed_precision ort_parameters.allreduce_post_accumulation = allreduce_post_accumulation ort_parameters.partition_optimizer = partition_optimizer + if seed is not None: + ort_parameters.seed = seed ort_parameters.enable_grad_norm_clip = enable_grad_norm_clip output_types = {} @@ -516,6 +520,7 @@ class ORTTrainer(): learning_rate_description, device, gradient_accumulation_steps=1, postprocess_model=None, world_rank=0, world_size=1, use_mixed_precision=False, allreduce_post_accumulation=False, global_step=0, get_lr_this_step=None, loss_scaler=None, partition_optimizer=False, + seed=None, enable_grad_norm_clip=True, frozen_weights=[]): super(ORTTrainer, self).__init__() """ @@ -546,6 +551,7 @@ class ORTTrainer(): use_mixed_precision: allreduce_post_accumulation: partition_optimizer: Whether to partition the optimizer state. (default=False) + seed: allow user code to set backend static random seed. """ self.is_train = True @@ -593,6 +599,7 @@ class ORTTrainer(): self.enable_grad_norm_clip_ = enable_grad_norm_clip self.frozen_weights_ = frozen_weights self.loss_scale_input_name = '' + self.seed_ = seed self._init_session() @@ -608,7 +615,9 @@ class ORTTrainer(): self.world_rank, self.world_size, self.gradient_accumulation_steps, bind_parameters=False, use_mixed_precision=self.use_mixed_precision, allreduce_post_accumulation=self.allreduce_post_accumulation_, - partition_optimizer=self.partition_optimizer_, enable_grad_norm_clip=self.enable_grad_norm_clip_, + partition_optimizer=self.partition_optimizer_, + enable_grad_norm_clip=self.enable_grad_norm_clip_, + seed=self.seed_, frozen_weights=self.frozen_weights_) self.loss_scale_input_name = self.session.loss_scale_input_name diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc b/orttraining/orttraining/python/orttraining_pybind_state.cc index 341a53bf70..072dc53337 100644 --- a/orttraining/orttraining/python/orttraining_pybind_state.cc +++ b/orttraining/orttraining/python/orttraining_pybind_state.cc @@ -187,7 +187,8 @@ void addObjectMethodsForTraining(py::module& m) { .def_readwrite("world_size", &TrainingParameters::world_size) .def_readwrite("gradient_accumulation_steps", &TrainingParameters::gradient_accumulation_steps) .def_readwrite("partition_optimizer", &TrainingParameters::partition_optimizer) - .def_readwrite("enable_grad_norm_clip", &TrainingParameters::enable_grad_norm_clip); + .def_readwrite("enable_grad_norm_clip", &TrainingParameters::enable_grad_norm_clip) + .def_readwrite("seed", &TrainingParameters::seed); py::class_ config_result(m, "TrainingConfigurationResult", "pbdoc(Configuration result for training.)pbdoc"); config_result.def(py::init())