From 41aa2b4ef1b9c2d14d5b06af2e0faa10592779dd Mon Sep 17 00:00:00 2001
From: Lysandre Debut
Date: Thu, 27 Aug 2020 11:16:50 +0200
Subject: [PATCH] Adafactor docs (#6765)

---
 .../main_classes/optimizer_schedules.rst |  5 ++
 src/transformers/optimization.py         | 62 +++++++++++++------
 2 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst
index 1df71adfa..998100075 100644
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -13,6 +13,11 @@ The ``.optimization`` module provides:
 .. autoclass:: transformers.AdamW
     :members:
 
+``Adafactor`` (PyTorch)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Adafactor
+
 ``AdamWeightDecay`` (TensorFlow)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index defafc51f..12da9a32d 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -328,31 +328,57 @@ class Adafactor(Optimizer):
     *warmup_init* options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and `relative_step=False`.
 
     Arguments:
-        params (iterable): iterable of parameters to optimize or dicts defining parameter groups
-        lr (float, optional): external learning rate (default: None)
-        eps (tuple[float, float]): regularization constants for square gradient
-            and parameter scale respectively (default: (1e-30, 1e-3))
-        clip_threshold (float, default 1.0): threshold of root mean square of final gradient update
-        decay_rate (float, default: -0.8): coefficient used to compute running averages of square
-        beta1 (float): coefficient used for computing running averages of gradient
-        weight_decay (float, default=0): weight decay (L2 penalty)
-        scale_parameter (bool, default: True): if True, learning rate is scaled by root mean square of
-        relative_step (bool, default: True): if True, time-dependent learning rate is computed instead of external learning rate
-        warmup_init (bool, default: False): time-dependent learning rate computation depends on whether warm-up initialization is being used
+        params (:obj:`Iterable[torch.nn.parameter.Parameter]`):
+            Iterable of parameters to optimize or dictionaries defining parameter groups.
+        lr (:obj:`float`, `optional`):
+            The external learning rate.
+        eps (:obj:`Tuple[float, float]`, `optional`, defaults to (1e-30, 1e-3)):
+            Regularization constants for the square gradient and parameter scale, respectively.
+        clip_threshold (:obj:`float`, `optional`, defaults to 1.0):
+            Threshold of the root mean square of the final gradient update.
+        decay_rate (:obj:`float`, `optional`, defaults to -0.8):
+            Coefficient used to compute running averages of the square gradient.
+        beta1 (:obj:`float`, `optional`):
+            Coefficient used for computing running averages of the gradient.
+        weight_decay (:obj:`float`, `optional`, defaults to 0):
+            Weight decay (L2 penalty).
+        scale_parameter (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            If :obj:`True`, the learning rate is scaled by the root mean square of the parameter.
+        relative_step (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            If :obj:`True`, a time-dependent learning rate is computed instead of using an external learning rate.
+        warmup_init (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Time-dependent learning rate computation depends on whether warm-up initialization is being used.
 
     This implementation handles low-precision (FP16, bfloat) values, but we have not thoroughly tested.
 
     Recommended T5 finetuning settings:
-        scheduled LR warm-up to fixed LR, disable relative updates, use clip threshold: https://arxiv.org/abs/2004.14546
-        Adafactor(model.parameters(), lr=1e-3, relative_step=False, warmup_init=True)
-        Alternatively, relative_step with warmup_init can be used.
-        Training without LR warmup or clip threshold, is not recommended. Additional optimizer operations like gradient clipping, should not be used alongside Adafactor.
+        - Scheduled LR warm-up to fixed LR
+        - Disable relative updates
+        - Use clip threshold: https://arxiv.org/abs/2004.14546
+
+        Example::
+
+            Adafactor(model.parameters(), lr=1e-3, relative_step=False, warmup_init=True)
+
+    - Alternatively, relative_step with warmup_init can be used.
+    - Training without LR warmup or clip threshold is not recommended. Additional optimizer operations like
+      gradient clipping should not be used alongside Adafactor.
 
     Usage::
 
-        optimizer = Adafactor(model.parameters(), lr=1e-3, eps=(1e-30, 1e-3), clip_threshold=1.0,
-                              decay_rate=-0.8, beta1=None, weight_decay=0.0, relative_step=False,
-                              scale_parameter=False, warmup_init=False,)
+        # replace AdamW with Adafactor
+        optimizer = Adafactor(
+            model.parameters(),
+            lr=1e-3,
+            eps=(1e-30, 1e-3),
+            clip_threshold=1.0,
+            decay_rate=-0.8,
+            beta1=None,
+            weight_decay=0.0,
+            relative_step=False,
+            scale_parameter=False,
+            warmup_init=False
+        )
     """
 
     def __init__(
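
The defaults documented in the new ``Arguments`` section are stored verbatim in the optimizer's ``param_groups``, so they are easy to check interactively. A minimal sketch (not part of the patch), assuming a ``transformers`` install that exports ``Adafactor``; the ``torch.nn.Linear`` model is a stand-in for illustration only::

    import torch
    from transformers import Adafactor

    model = torch.nn.Linear(10, 2)             # stand-in for a real model
    optimizer = Adafactor(model.parameters())  # all documented defaults

    group = optimizer.param_groups[0]
    print(group["lr"])              # None: no external learning rate set
    print(group["eps"])             # (1e-30, 0.001)
    print(group["clip_threshold"])  # 1.0
    print(group["decay_rate"])      # -0.8
    print(group["beta1"])           # None
    print(group["weight_decay"])    # 0.0
    print(group["scale_parameter"], group["relative_step"], group["warmup_init"])
    # True True False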
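
The docstring describes two mutually exclusive ways to drive the learning rate: a fixed external ``lr`` with ``relative_step=False``, or a time-dependent schedule computed by the optimizer itself with ``relative_step=True``, in which case ``warmup_init`` takes effect. A runnable sketch of both modes (again with a stand-in model and dummy data, not part of the patch)::

    import torch
    from transformers import Adafactor

    model = torch.nn.Linear(10, 2)  # stand-in for a real model

    # Manual (external) learning rate, per the recommended T5 settings:
    # relative updates and parameter scaling disabled.
    optimizer = Adafactor(
        model.parameters(),
        lr=1e-3,
        clip_threshold=1.0,
        relative_step=False,
        scale_parameter=False,
    )

    # Alternative: let the optimizer compute a time-dependent learning rate
    # with warm-up initialization; warmup_init only has an effect when
    # relative_step=True, and no external lr is supplied.
    optimizer = Adafactor(
        model.parameters(),
        lr=None,
        relative_step=True,
        warmup_init=True,
    )

    # One optimization step on dummy data, to show the usual loop.
    loss = model(torch.randn(4, 10)).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()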