From 09e9fc42eb653cacc0b6db203fa83faa0530e297 Mon Sep 17 00:00:00 2001 From: Timo Kaufmann Date: Tue, 12 Oct 2021 13:17:30 +0200 Subject: [PATCH] Use consistent logging keys (#605) * Use a consistent key to log the total timesteps This changes the timestep logging key of on-policy algorithms from `time/total_timesteps` to `time/total timesteps` (note the underscore/space). The off-policy algorithms and the eval callback already use the latter, so this behavior is more consistent. * Use underscores instead of spaces in logging keys Most keys already followed this policy and consistent behavior is friendlier to new users. * Minor edit and bump version Co-authored-by: Antonin Raffin --- docs/misc/changelog.rst | 9 +++++++-- stable_baselines3/common/callbacks.py | 2 +- stable_baselines3/common/off_policy_algorithm.py | 4 ++-- stable_baselines3/dqn/dqn.py | 2 +- stable_baselines3/version.txt | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 211ebe0..904e10e 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -4,7 +4,7 @@ Changelog ========== -Release 1.2.1a2 (WIP) +Release 1.2.1a3 (WIP) --------------------------- @@ -12,6 +12,11 @@ Breaking Changes: ^^^^^^^^^^^^^^^^^ - ``sde_net_arch`` argument in policies is deprecated and will be removed in a future version. - ``_get_latent`` (``ActorCriticPolicy``) was removed +- All logging keys now use underscores instead of spaces (@timokau). Concretely this changes: + + - ``time/total timesteps`` to ``time/total_timesteps`` for off-policy algorithms and the eval callback (on-policy algorithms such as PPO and A2C already used the underscored version), + - ``rollout/exploration rate`` to ``rollout/exploration_rate`` and + - ``rollout/success rate`` to ``rollout/success_rate``. 
New Features: ^^^^^^^^^^^^^ @@ -788,4 +793,4 @@ And all the contributors: @tirafesi @blurLake @koulakis @joeljosephjin @shwang @rk37 @andyshih12 @RaphaelWag @xicocaio @diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @SwamyDev @wmmc88 @megan-klaiber @thisray @tfederico @hn2 @LucasAlegre @AptX395 @zampanteymedio @JadenTravnik @decodyng @ardabbour @lorenz-h @mschweizer @lorepieri8 @vwxyzjn -@ShangqunYu @PierreExeter @JacopoPan @ltbd78 @tom-doerr @Atlis @liusida @09tangriro @amy12xx @juancroldan @benblack769 @bstee615 @c-rizz @skandermoalla @MihaiAnca13 @davidblom603 @ayeright @cyprienc @wkirgsn @AechPro @CUN-bjy @batu @IljaAvadiev +@ShangqunYu @PierreExeter @JacopoPan @ltbd78 @tom-doerr @Atlis @liusida @09tangriro @amy12xx @juancroldan @benblack769 @bstee615 @c-rizz @skandermoalla @MihaiAnca13 @davidblom603 @ayeright @cyprienc @wkirgsn @AechPro @CUN-bjy @batu @IljaAvadiev @timokau diff --git a/stable_baselines3/common/callbacks.py b/stable_baselines3/common/callbacks.py index 9825347..5f584da 100644 --- a/stable_baselines3/common/callbacks.py +++ b/stable_baselines3/common/callbacks.py @@ -423,7 +423,7 @@ class EvalCallback(EventCallback): self.logger.record("eval/success_rate", success_rate) # Dump log so the evaluation results are printed with the correct timestep - self.logger.record("time/total timesteps", self.num_timesteps, exclude="tensorboard") + self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard") self.logger.dump(self.num_timesteps) if mean_reward > self.best_mean_reward: diff --git a/stable_baselines3/common/off_policy_algorithm.py b/stable_baselines3/common/off_policy_algorithm.py index fce62e4..c26c0a4 100644 --- a/stable_baselines3/common/off_policy_algorithm.py +++ b/stable_baselines3/common/off_policy_algorithm.py @@ -437,12 +437,12 @@ class OffPolicyAlgorithm(BaseAlgorithm): self.logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer])) self.logger.record("time/fps", fps) 
self.logger.record("time/time_elapsed", int(time_elapsed), exclude="tensorboard") - self.logger.record("time/total timesteps", self.num_timesteps, exclude="tensorboard") + self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard") if self.use_sde: self.logger.record("train/std", (self.actor.get_std()).mean().item()) if len(self.ep_success_buffer) > 0: - self.logger.record("rollout/success rate", safe_mean(self.ep_success_buffer)) + self.logger.record("rollout/success_rate", safe_mean(self.ep_success_buffer)) # Pass the number of timesteps for tensorboard self.logger.dump(step=self.num_timesteps) diff --git a/stable_baselines3/dqn/dqn.py b/stable_baselines3/dqn/dqn.py index a99220b..69b2227 100644 --- a/stable_baselines3/dqn/dqn.py +++ b/stable_baselines3/dqn/dqn.py @@ -149,7 +149,7 @@ class DQN(OffPolicyAlgorithm): polyak_update(self.q_net.parameters(), self.q_net_target.parameters(), self.tau) self.exploration_rate = self.exploration_schedule(self._current_progress_remaining) - self.logger.record("rollout/exploration rate", self.exploration_rate) + self.logger.record("rollout/exploration_rate", self.exploration_rate) def train(self, gradient_steps: int, batch_size: int = 100) -> None: # Switch to train mode (this affects batch norm / dropout) diff --git a/stable_baselines3/version.txt b/stable_baselines3/version.txt index c4baa5c..90ebae4 100644 --- a/stable_baselines3/version.txt +++ b/stable_baselines3/version.txt @@ -1 +1 @@ -1.2.1a2 +1.2.1a3