From 553029909620455e040a49032a9c45f6a5f0cd52 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 30 Nov 2020 11:12:15 -0500 Subject: [PATCH] Remove deprecated `evaluate_during_training` (#8852) * Remove deprecated `evaluate_during_training` * Update src/transformers/training_args_tf.py Co-authored-by: Lysandre Debut Co-authored-by: Lysandre Debut --- examples/seq2seq/builtin_trainer/finetune.sh | 3 ++- examples/seq2seq/builtin_trainer/finetune_tpu.sh | 3 ++- .../seq2seq/builtin_trainer/train_distil_marian_enro.sh | 3 ++- .../builtin_trainer/train_distil_marian_enro_tpu.sh | 3 ++- examples/seq2seq/builtin_trainer/train_distilbart_cnn.sh | 3 ++- examples/seq2seq/builtin_trainer/train_mbart_cc25_enro.sh | 3 ++- src/transformers/integrations.py | 5 +++-- src/transformers/trainer_tf.py | 4 ++-- src/transformers/training_args_tf.py | 8 ++++++-- 9 files changed, 23 insertions(+), 12 deletions(-) diff --git a/examples/seq2seq/builtin_trainer/finetune.sh b/examples/seq2seq/builtin_trainer/finetune.sh index 65f207c21..8c2d13d5a 100644 --- a/examples/seq2seq/builtin_trainer/finetune.sh +++ b/examples/seq2seq/builtin_trainer/finetune.sh @@ -3,7 +3,8 @@ python finetune_trainer.py \ --learning_rate=3e-5 \ --fp16 \ - --do_train --do_eval --do_predict --evaluate_during_training \ + --do_train --do_eval --do_predict \ + --evaluation_strategy steps \ --predict_with_generate \ --n_val 1000 \ "$@" diff --git a/examples/seq2seq/builtin_trainer/finetune_tpu.sh b/examples/seq2seq/builtin_trainer/finetune_tpu.sh index 8bd367c85..577f99fc7 100644 --- a/examples/seq2seq/builtin_trainer/finetune_tpu.sh +++ b/examples/seq2seq/builtin_trainer/finetune_tpu.sh @@ -5,7 +5,8 @@ export TPU_NUM_CORES=8 python xla_spawn.py --num_cores $TPU_NUM_CORES \ finetune_trainer.py \ --learning_rate=3e-5 \ - --do_train --do_eval --evaluate_during_training \ + --do_train --do_eval \ + --evaluation_strategy steps \ --prediction_loss_only \ --n_val 1000 \ "$@" 
diff --git a/examples/seq2seq/builtin_trainer/train_distil_marian_enro.sh b/examples/seq2seq/builtin_trainer/train_distil_marian_enro.sh index 1503e821a..10c809b0e 100644 --- a/examples/seq2seq/builtin_trainer/train_distil_marian_enro.sh +++ b/examples/seq2seq/builtin_trainer/train_distil_marian_enro.sh @@ -16,7 +16,8 @@ python finetune_trainer.py \ --num_train_epochs=6 \ --save_steps 3000 --eval_steps 3000 \ --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ - --do_train --do_eval --do_predict --evaluate_during_training\ + --do_train --do_eval --do_predict \ + --evaluation_strategy steps \ --predict_with_generate --logging_first_step \ --task translation --label_smoothing 0.1 \ "$@" diff --git a/examples/seq2seq/builtin_trainer/train_distil_marian_enro_tpu.sh b/examples/seq2seq/builtin_trainer/train_distil_marian_enro_tpu.sh index ca9a57fa4..098425d65 100644 --- a/examples/seq2seq/builtin_trainer/train_distil_marian_enro_tpu.sh +++ b/examples/seq2seq/builtin_trainer/train_distil_marian_enro_tpu.sh @@ -17,7 +17,8 @@ python xla_spawn.py --num_cores $TPU_NUM_CORES \ --save_steps 500 --eval_steps 500 \ --logging_first_step --logging_steps 200 \ --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ - --do_train --do_eval --evaluate_during_training \ + --do_train --do_eval \ + --evaluation_strategy steps \ --prediction_loss_only \ --task translation --label_smoothing 0.1 \ "$@" diff --git a/examples/seq2seq/builtin_trainer/train_distilbart_cnn.sh b/examples/seq2seq/builtin_trainer/train_distilbart_cnn.sh index dbb85cbe1..d29f6b803 100644 --- a/examples/seq2seq/builtin_trainer/train_distilbart_cnn.sh +++ b/examples/seq2seq/builtin_trainer/train_distilbart_cnn.sh @@ -19,6 +19,7 @@ python finetune_trainer.py \ --save_steps 3000 --eval_steps 3000 \ --logging_first_step \ --max_target_length 56 --val_max_target_length 
$MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \ - --do_train --do_eval --do_predict --evaluate_during_training \ + --do_train --do_eval --do_predict \ + --evaluation_strategy steps \ --predict_with_generate --sortish_sampler \ "$@" diff --git a/examples/seq2seq/builtin_trainer/train_mbart_cc25_enro.sh b/examples/seq2seq/builtin_trainer/train_mbart_cc25_enro.sh index 7a2a5c722..3dc711f20 100644 --- a/examples/seq2seq/builtin_trainer/train_mbart_cc25_enro.sh +++ b/examples/seq2seq/builtin_trainer/train_mbart_cc25_enro.sh @@ -15,7 +15,8 @@ python finetune_trainer.py \ --sortish_sampler \ --num_train_epochs 6 \ --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \ - --do_train --do_eval --do_predict --evaluate_during_training \ + --do_train --do_eval --do_predict \ + --evaluation_strategy steps \ --predict_with_generate --logging_first_step \ --task translation \ "$@" diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index d14e6e7ce..4c813e1ce 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -2,6 +2,7 @@ import math import os +from .trainer_utils import EvaluationStrategy from .utils import logging @@ -212,13 +213,13 @@ def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestR # Check for `do_eval` and `eval_during_training` for schedulers that require intermediate reporting. if isinstance( kwargs["scheduler"], (ASHAScheduler, MedianStoppingRule, HyperBandForBOHB, PopulationBasedTraining) - ) and (not trainer.args.do_eval or not trainer.args.evaluate_during_training): + ) and (not trainer.args.do_eval or trainer.args.evaluation_strategy == EvaluationStrategy.NO): raise RuntimeError( "You are using {cls} as a scheduler but you haven't enabled evaluation during training. " "This means your trials will not report intermediate results to Ray Tune, and " "can thus not be stopped early or used to exploit other trials parameters. 
" "If this is what you want, do not use {cls}. If you would like to use {cls}, " - "make sure you pass `do_eval=True` and `evaluate_during_training=True` in the " + "make sure you pass `do_eval=True` and `evaluation_strategy='steps'` in the " "Trainer `args`.".format(cls=type(kwargs["scheduler"]).__name__) ) diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py index 6275ceafe..162815dbc 100644 --- a/src/transformers/trainer_tf.py +++ b/src/transformers/trainer_tf.py @@ -19,7 +19,7 @@ from tensorflow.python.distribute.values import PerReplica from .modeling_tf_utils import TFPreTrainedModel from .optimization_tf import GradientAccumulator, create_optimizer -from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput, set_seed +from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, EvaluationStrategy, PredictionOutput, set_seed from .training_args_tf import TFTrainingArguments from .utils import logging @@ -561,7 +561,7 @@ class TFTrainer: if ( self.args.eval_steps > 0 - and self.args.evaluate_during_training + and self.args.evaluation_strategy == EvaluationStrategy.STEPS and self.global_step % self.args.eval_steps == 0 ): self.evaluate() diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 91890605d..2efe7a6be 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -34,8 +34,12 @@ class TFTrainingArguments(TrainingArguments): Whether to run evaluation on the dev set or not. do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to run predictions on the test set or not. - evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether to run evaluation during training at each logging step or not. + evaluation_strategy (:obj:`str` or :class:`~transformers.trainer_utils.EvaluationStrategy`, `optional`, defaults to :obj:`"no"`): + The evaluation strategy to adopt during training. 
Possible values are: + + * :obj:`"no"`: No evaluation is done during training. + * :obj:`"steps"`: Evaluation is done (and logged) every :obj:`eval_steps`. + per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8): The batch size per GPU/TPU core/CPU for training. per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):