From 88e1be9ff5e04b7688efa44951f845b7daf5717f Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN <antonin.raffin@ensta.org>
Date: Sun, 23 May 2021 13:13:11 +0200
Subject: [PATCH] Documentation update (#450)

* Update migration guide

* Add sanity check

* Removed parameter ``channels_last`` from ``is_image_space``

* Pin docutils

* Clarify callback `save_freq` definition

* Update docs/misc/changelog.rst

* Update docs/misc/changelog.rst

* Fix typos

Co-authored-by: Anssi <kaneran21@hotmail.com>
---
 docs/conda_env.yml                        |  3 +++
 docs/guide/callbacks.rst                  |  9 ++++++++-
 docs/guide/migration.rst                  |  3 ++-
 docs/misc/changelog.rst                   |  8 +++++++-
 stable_baselines3/common/callbacks.py     | 17 +++++++++++++++--
 stable_baselines3/common/preprocessing.py | 11 ++++-------
 stable_baselines3/common/torch_layers.py  |  2 +-
 stable_baselines3/ppo/ppo.py              |  9 ++++++++-
 tests/test_cnn.py                         |  7 ++++++-
 9 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/docs/conda_env.yml b/docs/conda_env.yml
index 9ea054e..290b499 100644
--- a/docs/conda_env.yml
+++ b/docs/conda_env.yml
@@ -15,3 +15,6 @@ dependencies:
     - numpy
     - matplotlib
     - sphinx_autodoc_typehints
+    # Tmp fix, docutils==0.17 breaks rtd theme
+    # See https://github.com/readthedocs/sphinx_rtd_theme/issues/1115
+    - docutils==0.16
diff --git a/docs/guide/callbacks.rst b/docs/guide/callbacks.rst
index f5d9d02..810e367 100644
--- a/docs/guide/callbacks.rst
+++ b/docs/guide/callbacks.rst
@@ -155,9 +155,16 @@ Stable Baselines provides you with a set of common callbacks for:
 CheckpointCallback
 ^^^^^^^^^^^^^^^^^^
 
-Callback for saving a model every ``save_freq`` steps, you must specify a log folder (``save_path``)
+Callback for saving a model every ``save_freq`` calls to ``env.step()``, you must specify a log folder (``save_path``)
 and optionally a prefix for the checkpoints (``rl_model`` by default).
 
+.. warning::
+
+  When using multiple environments, each call to  ``env.step()`` will effectively correspond to ``n_envs`` steps.
+  If you want the ``save_freq`` to be similar when using different number of environments,
+  you need to account for it using ``save_freq = max(save_freq // n_envs, 1)``.
+  The same goes for the other callbacks.
+
 
 .. code-block:: python
 
diff --git a/docs/guide/migration.rst b/docs/guide/migration.rst
index 9b25f95..c9c15cd 100644
--- a/docs/guide/migration.rst
+++ b/docs/guide/migration.rst
@@ -46,7 +46,8 @@ Breaking Changes
 
 - SB3 requires python 3.6+ (instead of python 3.5+ for SB2)
 - Dropped MPI support
-- Dropped layer normalized policies (e.g. ``LnMlpPolicy``)
+- Dropped layer normalized policies (``MlpLnLstmPolicy``, ``CnnLnLstmPolicy``)
+- LSTM policies (```MlpLstmPolicy```, ```CnnLstmPolicy```) are not supported for the time being
 - Dropped parameter noise for DDPG and DQN
 - PPO is now closer to the original implementation (no clipping of the value function by default), cf PPO section below
 - Orthogonal initialization is only used by A2C/PPO
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index 6173221..0794a3f 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -27,6 +27,7 @@ Breaking Changes:
 
 - Updated the KL Divergence estimator in the PPO algorithm to be positive definite and have lower variance (@09tangriro)
 - Updated the KL Divergence check in the PPO algorithm to be before the gradient update step rather than after end of epoch (@09tangriro)
+- Removed parameter ``channels_last`` from ``is_image_space`` as it can be inferred.
 
 New Features:
 ^^^^^^^^^^^^^
@@ -62,6 +63,7 @@ Others:
 - Added Code of Conduct
 - Added tests for GAE and lambda return computation
 - Updated distribution entropy test (thanks @09tangriro)
+- Added sanity check ``batch_size > 1`` in PPO to avoid NaN in advantage normalization
 
 Documentation:
 ^^^^^^^^^^^^^^
@@ -75,6 +77,10 @@ Documentation:
 - Added example for using ``ProcgenEnv``
 - Added note about advanced custom policy example for off-policy algorithms
 - Fixed DQN unicode checkmarks
+- Updated migration guide (@juancroldan)
+- Pinned ``docutils==0.16`` to avoid issue with rtd theme
+- Clarified callback ``save_freq`` definition
+
 
 Release 1.0 (2021-03-15)
 ------------------------
@@ -690,4 +696,4 @@ And all the contributors:
 @tirafesi @blurLake @koulakis @joeljosephjin @shwang @rk37 @andyshih12 @RaphaelWag @xicocaio
 @diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @SwamyDev @wmmc88 @megan-klaiber @thisray
 @tfederico @hn2 @LucasAlegre @AptX395 @zampanteymedio @JadenTravnik @decodyng @ardabbour @lorenz-h @mschweizer @lorepieri8 @vwxyzjn
-@ShangqunYu @PierreExeter @JacopoPan @ltbd78 @tom-doerr @Atlis @liusida @09tangriro @amy12xx
+@ShangqunYu @PierreExeter @JacopoPan @ltbd78 @tom-doerr @Atlis @liusida @09tangriro @amy12xx @juancroldan
diff --git a/stable_baselines3/common/callbacks.py b/stable_baselines3/common/callbacks.py
index fa806a8..4684383 100644
--- a/stable_baselines3/common/callbacks.py
+++ b/stable_baselines3/common/callbacks.py
@@ -212,7 +212,14 @@ class CallbackList(BaseCallback):
 
 class CheckpointCallback(BaseCallback):
     """
-    Callback for saving a model every ``save_freq`` steps
+    Callback for saving a model every ``save_freq`` calls
+    to ``env.step()``.
+
+    .. warning::
+
+      When using multiple environments, each call to  ``env.step()``
+      will effectively correspond to ``n_envs`` steps.
+      To account for that, you can use ``save_freq = max(save_freq // n_envs, 1)``
 
     :param save_freq:
     :param save_path: Path to the folder where the model will be saved.
@@ -262,11 +269,17 @@ class EvalCallback(EventCallback):
     """
     Callback for evaluating an agent.
 
+    .. warning::
+
+      When using multiple environments, each call to  ``env.step()``
+      will effectively correspond to ``n_envs`` steps.
+      To account for that, you can use ``eval_freq = max(eval_freq // n_envs, 1)``
+
     :param eval_env: The environment used for initialization
     :param callback_on_new_best: Callback to trigger
         when there is a new best model according to the ``mean_reward``
     :param n_eval_episodes: The number of episodes to test the agent
-    :param eval_freq: Evaluate the agent every eval_freq call of the callback.
+    :param eval_freq: Evaluate the agent every ``eval_freq`` call of the callback.
     :param log_path: Path to a folder where the evaluations (``evaluations.npz``)
         will be saved. It will be updated at each evaluation.
     :param best_model_save_path: Path to a folder where the best model
diff --git a/stable_baselines3/common/preprocessing.py b/stable_baselines3/common/preprocessing.py
index 7aaeb12..01422aa 100644
--- a/stable_baselines3/common/preprocessing.py
+++ b/stable_baselines3/common/preprocessing.py
@@ -26,19 +26,16 @@ def is_image_space_channels_first(observation_space: spaces.Box) -> bool:
 
 def is_image_space(
     observation_space: spaces.Space,
-    channels_last: bool = True,
     check_channels: bool = False,
 ) -> bool:
     """
     Check if a observation space has the shape, limits and dtype
     of a valid image.
-    The check is conservative, so that it returns False
-    if there is a doubt.
+    The check is conservative, so that it returns False if there is a doubt.
 
     Valid images: RGB, RGBD, GrayScale with values in [0, 255]
 
     :param observation_space:
-    :param channels_last:
     :param check_channels: Whether to do or not the check for the number of channels.
         e.g., with frame-stacking, the observation space may have more channels than expected.
     :return:
@@ -56,10 +53,10 @@ def is_image_space(
         if not check_channels:
             return True
         # Check the number of channels
-        if channels_last:
-            n_channels = observation_space.shape[-1]
-        else:
+        if is_image_space_channels_first(observation_space):
             n_channels = observation_space.shape[0]
+        else:
+            n_channels = observation_space.shape[-1]
         # RGB, RGBD, GrayScale
         return n_channels in [1, 3, 4]
     return False
diff --git a/stable_baselines3/common/torch_layers.py b/stable_baselines3/common/torch_layers.py
index 16088f3..71b647b 100644
--- a/stable_baselines3/common/torch_layers.py
+++ b/stable_baselines3/common/torch_layers.py
@@ -64,7 +64,7 @@ class NatureCNN(BaseFeaturesExtractor):
         super(NatureCNN, self).__init__(observation_space, features_dim)
         # We assume CxHxW images (channels first)
         # Re-ordering will be done by pre-preprocessing or wrapper
-        assert is_image_space(observation_space), (
+        assert is_image_space(observation_space, check_channels=False), (
             "You should use NatureCNN "
             f"only with images not with {observation_space}\n"
             "(you are probably using `CnnPolicy` instead of `MlpPolicy` or `MultiInputPolicy`)\n"
diff --git a/stable_baselines3/ppo/ppo.py b/stable_baselines3/ppo/ppo.py
index 34737c7..660eb32 100644
--- a/stable_baselines3/ppo/ppo.py
+++ b/stable_baselines3/ppo/ppo.py
@@ -118,6 +118,13 @@ class PPO(OnPolicyAlgorithm):
                 spaces.MultiBinary,
             ),
         )
+
+        # Sanity check, otherwise it will lead to noisy gradient and NaN
+        # because of the advantage normalization
+        assert (
+            batch_size > 1
+        ), "`batch_size` must be greater than 1. See https://github.com/DLR-RM/stable-baselines3/issues/440"
+
         if self.env is not None:
             # Check that `n_steps * n_envs > 1` to avoid NaN
             # when doing advantage normalization
@@ -133,7 +140,7 @@ class PPO(OnPolicyAlgorithm):
                     f" but because the `RolloutBuffer` is of size `n_steps * n_envs = {buffer_size}`,"
                     f" after every {untruncated_batches} untruncated mini-batches,"
                     f" there will be a truncated mini-batch of size {buffer_size % batch_size}\n"
-                    f"We recommend using a `batch_size` that is a multiple of `n_steps * n_envs`.\n"
+                    f"We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.\n"
                     f"Info: (n_steps={self.n_steps} and n_envs={self.env.num_envs})"
                 )
         self.batch_size = batch_size
diff --git a/tests/test_cnn.py b/tests/test_cnn.py
index 5cac58e..8ec33fb 100644
--- a/tests/test_cnn.py
+++ b/tests/test_cnn.py
@@ -247,7 +247,12 @@ def test_image_space_checks():
     assert not is_image_space(not_image_space)
 
     an_image_space = spaces.Box(0, 255, shape=(10, 10, 3), dtype=np.uint8)
-    assert is_image_space(an_image_space)
+    assert is_image_space(an_image_space, check_channels=False)
+    assert is_image_space(an_image_space, check_channels=True)
+
+    channel_first_image_space = spaces.Box(0, 255, shape=(3, 10, 10), dtype=np.uint8)
+    assert is_image_space(channel_first_image_space, check_channels=False)
+    assert is_image_space(channel_first_image_space, check_channels=True)
 
     an_image_space_with_odd_channels = spaces.Box(0, 255, shape=(10, 10, 5), dtype=np.uint8)
     assert is_image_space(an_image_space_with_odd_channels)