From 4611387608228c7248be8a4aa59bcc4d97d83aa2 Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Thu, 15 Apr 2021 20:05:14 -0700
Subject: [PATCH] [optim] take kw-only argument for functional optim APIs
 (#56185)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56185

ghstack-source-id: 126670123

Reviewed By: albanD

Differential Revision: D27802169

fbshipit-source-id: f5e1cb2046dcdeecf5f6b0f70892828bf0adb22f
---
 torch/distributed/optim/functional_adadelta.py |  8 ++++----
 torch/distributed/optim/functional_adagrad.py  |  8 ++++----
 torch/distributed/optim/functional_adam.py     | 12 ++++++------
 torch/distributed/optim/functional_adamax.py   | 10 +++++-----
 torch/distributed/optim/functional_adamw.py    | 12 ++++++------
 torch/distributed/optim/functional_rmsprop.py  | 12 ++++++------
 torch/distributed/optim/functional_rprop.py    |  8 ++++----
 torch/distributed/optim/functional_sgd.py      | 10 +++++-----
 torch/optim/_functional.py                     |  8 ++++++++
 torch/optim/adadelta.py                        |  8 ++++----
 torch/optim/adagrad.py                         |  8 ++++----
 torch/optim/adam.py                            | 12 ++++++------
 torch/optim/adamax.py                          | 10 +++++-----
 torch/optim/adamw.py                           | 12 ++++++------
 torch/optim/rmsprop.py                         | 12 ++++++------
 torch/optim/rprop.py                           |  8 ++++----
 torch/optim/sgd.py                             | 10 +++++-----
 17 files changed, 88 insertions(+), 80 deletions(-)

diff --git a/torch/distributed/optim/functional_adadelta.py b/torch/distributed/optim/functional_adadelta.py
index d412e4d0813..8bb3082042f 100644
--- a/torch/distributed/optim/functional_adadelta.py
+++ b/torch/distributed/optim/functional_adadelta.py
@@ -76,7 +76,7 @@ class _FunctionalAdadelta(object):
                        grads,
                        square_avgs,
                        acc_deltas,
-                       lr,
-                       rho,
-                       eps,
-                       weight_decay)
+                       lr=lr,
+                       rho=rho,
+                       eps=eps,
+                       weight_decay=weight_decay)
diff --git a/torch/distributed/optim/functional_adagrad.py b/torch/distributed/optim/functional_adagrad.py
index fc28450f802..610f2040e5a 100644
--- a/torch/distributed/optim/functional_adagrad.py
+++ b/torch/distributed/optim/functional_adagrad.py
@@ -84,7 +84,7 @@ class _FunctionalAdagrad(object):
                      grads,
                      state_sums,
                      state_steps,
-                     self.defaults['lr'],
-                     self.defaults['weight_decay'],
-                     self.defaults['lr_decay'],
-                     self.defaults['eps'])
+                     lr=self.defaults['lr'],
+                     weight_decay=self.defaults['weight_decay'],
+                     lr_decay=self.defaults['lr_decay'],
+                     eps=self.defaults['eps'])
diff --git a/torch/distributed/optim/functional_adam.py b/torch/distributed/optim/functional_adam.py
index 33638e6bde6..4a39d0104bf 100644
--- a/torch/distributed/optim/functional_adam.py
+++ b/torch/distributed/optim/functional_adam.py
@@ -105,9 +105,9 @@ class _FunctionalAdam(object):
                   exp_avg_sqs,
                   max_exp_avg_sqs,
                   state_steps,
-                  self.amsgrad,
-                  self.defaults['beta1'],
-                  self.defaults['beta2'],
-                  self.defaults['lr'],
-                  self.defaults['weight_decay'],
-                  self.defaults['eps'])
+                  amsgrad=self.amsgrad,
+                  beta1=self.defaults['beta1'],
+                  beta2=self.defaults['beta2'],
+                  lr=self.defaults['lr'],
+                  weight_decay=self.defaults['weight_decay'],
+                  eps=self.defaults['eps'])
diff --git a/torch/distributed/optim/functional_adamax.py b/torch/distributed/optim/functional_adamax.py
index 7daa315636e..18f3936b797 100644
--- a/torch/distributed/optim/functional_adamax.py
+++ b/torch/distributed/optim/functional_adamax.py
@@ -95,8 +95,8 @@ class _FunctionalAdamax(object):
                     exp_avgs,
                     exp_infs,
                     state_steps,
-                    self.defaults['eps'],
-                    self.defaults['beta1'],
-                    self.defaults['beta2'],
-                    self.defaults['lr'],
-                    self.defaults['weight_decay'])
+                    eps=self.defaults['eps'],
+                    beta1=self.defaults['beta1'],
+                    beta2=self.defaults['beta2'],
+                    lr=self.defaults['lr'],
+                    weight_decay=self.defaults['weight_decay'])
diff --git a/torch/distributed/optim/functional_adamw.py b/torch/distributed/optim/functional_adamw.py
index 682f04d383d..5c1a27d5e6a 100644
--- a/torch/distributed/optim/functional_adamw.py
+++ b/torch/distributed/optim/functional_adamw.py
@@ -105,9 +105,9 @@ class _FunctionalAdamW(object):
                    exp_avg_sqs,
                    max_exp_avg_sqs,
                    state_steps,
-                   self.amsgrad,
-                   self.defaults['beta1'],
-                   self.defaults['beta2'],
-                   self.defaults['lr'],
-                   self.defaults['weight_decay'],
-                   self.defaults['eps'])
+                   amsgrad=self.amsgrad,
+                   beta1=self.defaults['beta1'],
+                   beta2=self.defaults['beta2'],
+                   lr=self.defaults['lr'],
+                   weight_decay=self.defaults['weight_decay'],
+                   eps=self.defaults['eps'])
diff --git a/torch/distributed/optim/functional_rmsprop.py b/torch/distributed/optim/functional_rmsprop.py
index abd8d9d6936..b9a2db0c734 100644
--- a/torch/distributed/optim/functional_rmsprop.py
+++ b/torch/distributed/optim/functional_rmsprop.py
@@ -91,9 +91,9 @@ class _FunctionalRMSprop(object):
                      square_avgs,
                      grad_avgs,
                      momentum_buffer_list,
-                     lr,
-                     alpha,
-                     eps,
-                     weight_decay,
-                     momentum,
-                     self.centered)
+                     lr=lr,
+                     alpha=alpha,
+                     eps=eps,
+                     weight_decay=weight_decay,
+                     momentum=momentum,
+                     centered=self.centered)
diff --git a/torch/distributed/optim/functional_rprop.py b/torch/distributed/optim/functional_rprop.py
index 93f6e08d231..274e7ad43ad 100644
--- a/torch/distributed/optim/functional_rprop.py
+++ b/torch/distributed/optim/functional_rprop.py
@@ -75,7 +75,7 @@ class _FunctionalRprop(object):
                    grads,
                    prevs,
                    step_sizes,
-                   step_size_min,
-                   step_size_max,
-                   etaminus,
-                   etaplus)
+                   step_size_min=step_size_min,
+                   step_size_max=step_size_max,
+                   etaminus=etaminus,
+                   etaplus=etaplus)
diff --git a/torch/distributed/optim/functional_sgd.py b/torch/distributed/optim/functional_sgd.py
index fb4bade546c..343988fdac6 100644
--- a/torch/distributed/optim/functional_sgd.py
+++ b/torch/distributed/optim/functional_sgd.py
@@ -73,11 +73,11 @@ class _FunctionalSGD(object):
             F.sgd(params,
                   grads,
                   momentum_buffer_list,
-                  weight_decay,
-                  momentum,
-                  lr,
-                  dampening,
-                  self.nesterov)
+                  weight_decay=weight_decay,
+                  momentum=momentum,
+                  lr=lr,
+                  dampening=dampening,
+                  nesterov=self.nesterov)
 
             # update momentum_buffers in state
             for i, p in enumerate(params):
diff --git a/torch/optim/_functional.py b/torch/optim/_functional.py
index 0e03f3594bd..5b67aecb761 100644
--- a/torch/optim/_functional.py
+++ b/torch/optim/_functional.py
@@ -17,6 +17,7 @@ def adagrad(params: List[Tensor],
             grads: List[Tensor],
             state_sums: List[Tensor],
             state_steps: List[int],
+            *,
             lr: float,
             weight_decay: float,
             lr_decay: float,
@@ -56,6 +57,7 @@ def adam(params: List[Tensor],
         exp_avg_sqs: List[Tensor],
         max_exp_avg_sqs: List[Tensor],
         state_steps: List[int],
+        *,
         amsgrad: bool,
         beta1: float,
         beta2: float,
@@ -102,6 +104,7 @@ def adamw(params: List[Tensor],
          exp_avg_sqs: List[Tensor],
          max_exp_avg_sqs: List[Tensor],
          state_steps: List[int],
+         *,
          amsgrad: bool,
          beta1: float,
          beta2: float,
@@ -143,6 +146,7 @@ def adamw(params: List[Tensor],
 def sgd(params: List[Tensor],
         d_p_list: List[Tensor],
         momentum_buffer_list: List[Optional[Tensor]],
+        *,
         weight_decay: float,
         momentum: float,
         lr: float,
@@ -180,6 +184,7 @@ def adadelta(params: List[Tensor],
              grads: List[Tensor],
              square_avgs: List[Tensor],
              acc_deltas: List[Tensor],
+             *,
              lr: float,
              rho: float,
              eps: float,
@@ -205,6 +210,7 @@ def rmsprop(params: List[Tensor],
             square_avgs: List[Tensor],
             grad_avgs: List[Tensor],
             momentum_buffer_list: List[Tensor],
+            *,
             lr: float,
             alpha: float,
             eps: float,
@@ -244,6 +250,7 @@ def rprop(params: List[Tensor],
           grads: List[Tensor],
           prevs: List[Tensor],
           step_sizes: List[Tensor],
+          *,
           step_size_min: float,
           step_size_max: float,
           etaminus: float,
@@ -282,6 +289,7 @@ def adamax(params: List[Tensor],
           exp_avgs: List[Tensor],
           exp_infs: List[Tensor],
           state_steps: List[int],
+           *,
           eps: float,
           beta1: float,
           beta2: float,
diff --git a/torch/optim/adadelta.py b/torch/optim/adadelta.py
index 598349c8163..bd499ed0348 100644
--- a/torch/optim/adadelta.py
+++ b/torch/optim/adadelta.py
@@ -81,9 +81,9 @@ class Adadelta(Optimizer):
                        grads,
                        square_avgs,
                        acc_deltas,
-                       lr,
-                       rho,
-                       eps,
-                       weight_decay)
+                       lr=lr,
+                       rho=rho,
+                       eps=eps,
+                       weight_decay=weight_decay)
 
         return loss
diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py
index af846858c77..0ef4019fc69 100644
--- a/torch/optim/adagrad.py
+++ b/torch/optim/adagrad.py
@@ -84,9 +84,9 @@ class Adagrad(Optimizer):
                      grads,
                      state_sums,
                      state_steps,
-                     group['lr'],
-                     group['weight_decay'],
-                     group['lr_decay'],
-                     group['eps'])
+                     lr=group['lr'],
+                     weight_decay=group['weight_decay'],
+                     lr_decay=group['lr_decay'],
+                     eps=group['eps'])
 
         return loss
diff --git a/torch/optim/adam.py b/torch/optim/adam.py
index 607c39b1d92..d7313be75f8 100644
--- a/torch/optim/adam.py
+++ b/torch/optim/adam.py
@@ -110,10 +110,10 @@ class Adam(Optimizer):
                    exp_avg_sqs,
                    max_exp_avg_sqs,
                    state_steps,
-                   group['amsgrad'],
-                   beta1,
-                   beta2,
-                   group['lr'],
-                   group['weight_decay'],
-                   group['eps'])
+                   amsgrad=group['amsgrad'],
+                   beta1=beta1,
+                   beta2=beta2,
+                   lr=group['lr'],
+                   weight_decay=group['weight_decay'],
+                   eps=group['eps'])
         return loss
diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py
index e5591f5c158..4cb71c64d6e 100644
--- a/torch/optim/adamax.py
+++ b/torch/optim/adamax.py
@@ -89,10 +89,10 @@ class Adamax(Optimizer):
                     exp_avgs,
                     exp_infs,
                     state_steps,
-                    eps,
-                    beta1,
-                    beta2,
-                    lr,
-                    weight_decay)
+                    eps=eps,
+                    beta1=beta1,
+                    beta2=beta2,
+                    lr=lr,
+                    weight_decay=weight_decay)
 
         return loss
diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py
index 023a75cb6d7..d2ea738e4f5 100644
--- a/torch/optim/adamw.py
+++ b/torch/optim/adamw.py
@@ -113,11 +113,11 @@ class AdamW(Optimizer):
                     exp_avg_sqs,
                     max_exp_avg_sqs,
                     state_steps,
-                    amsgrad,
-                    beta1,
-                    beta2,
-                    group['lr'],
-                    group['weight_decay'],
-                    group['eps'])
+                    amsgrad=amsgrad,
+                    beta1=beta1,
+                    beta2=beta2,
+                    lr=group['lr'],
+                    weight_decay=group['weight_decay'],
+                    eps=group['eps'])
 
         return loss
diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py
index 940e3d226ff..4aab0b3116f 100644
--- a/torch/optim/rmsprop.py
+++ b/torch/optim/rmsprop.py
@@ -108,11 +108,11 @@ class RMSprop(Optimizer):
                      square_avgs,
                      grad_avgs,
                      momentum_buffer_list,
-                     group['lr'],
-                     group['alpha'],
-                     group['eps'],
-                     group['weight_decay'],
-                     group['momentum'],
-                     group['centered'])
+                     lr=group['lr'],
+                     alpha=group['alpha'],
+                     eps=group['eps'],
+                     weight_decay=group['weight_decay'],
+                     momentum=group['momentum'],
+                     centered=group['centered'])
 
         return loss
diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py
index 0b71ec29174..fb82fb39493 100644
--- a/torch/optim/rprop.py
+++ b/torch/optim/rprop.py
@@ -74,9 +74,9 @@ class Rprop(Optimizer):
                    grads,
                    prevs,
                    step_sizes,
-                   step_size_min,
-                   step_size_max,
-                   etaminus,
-                   etaplus)
+                   step_size_min=step_size_min,
+                   step_size_max=step_size_max,
+                   etaminus=etaminus,
+                   etaplus=etaplus)
 
         return loss
diff --git a/torch/optim/sgd.py b/torch/optim/sgd.py
index 772f636141f..a7a67ffe825 100644
--- a/torch/optim/sgd.py
+++ b/torch/optim/sgd.py
@@ -110,11 +110,11 @@ class SGD(Optimizer):
             F.sgd(params_with_grad,
                   d_p_list,
                   momentum_buffer_list,
-                  weight_decay,
-                  momentum,
-                  lr,
-                  dampening,
-                  nesterov)
+                  weight_decay=weight_decay,
+                  momentum=momentum,
+                  lr=lr,
+                  dampening=dampening,
+                  nesterov=nesterov)
 
             # update momentum_buffers in state
             for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list):
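
Note on the calling convention (illustration only, not part of the patch): after this change the
functional optimizer entry points in torch/optim/_functional.py declare their hyperparameters
after a bare `*`, so every caller -- including the TorchScript-friendly _Functional* wrappers in
torch/distributed/optim -- must pass them by keyword. The sketch below is a minimal, hypothetical
example of that pattern; the name `sgd_sketch` and its simplified update rule are assumptions for
illustration, not the real F.sgd signature.

    from typing import List

    import torch
    from torch import Tensor


    def sgd_sketch(params: List[Tensor],
                   grads: List[Tensor],
                   *,  # everything after the bare `*` is keyword-only
                   lr: float,
                   weight_decay: float = 0.0) -> None:
        """Toy SGD-style update, shaped like the functional optim APIs in this patch."""
        for param, grad in zip(params, grads):
            if weight_decay != 0:
                # decoupled form of L2 regularization folded into the gradient
                grad = grad.add(param, alpha=weight_decay)
            param.add_(grad, alpha=-lr)


    params = [torch.zeros(3)]
    grads = [torch.ones(3)]
    sgd_sketch(params, grads, lr=0.1, weight_decay=0.01)   # OK: hyperparameters are named
    # sgd_sketch(params, grads, 0.1, 0.01)  # TypeError: lr and weight_decay are keyword-only

Making the long tail of float/bool hyperparameters keyword-only guards against silently swapping
two positional values (for example lr and weight_decay) when call sites are updated.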