Merge branch 'master' into feat/multienv-off-policy

2026-06-22 02:30:39 +00:00 · 2021-05-23 13:27:44 +02:00 · 2021-05-23 13:27:44 +02:00 · a3e1ea4a32
commit a3e1ea4a32
parent 53a12ff936 88e1be9ff5
9 changed files with 54 additions and 15 deletions
--- a/docs/conda_env.yml
+++ b/docs/conda_env.yml
@ -15,3 +15,6 @@ dependencies:
    - numpy
    - matplotlib
    - sphinx_autodoc_typehints
+    # Tmp fix, docutils==0.17 breaks rtd theme
+    # See https://github.com/readthedocs/sphinx_rtd_theme/issues/1115
+    - docutils==0.16
--- a/docs/guide/callbacks.rst
+++ b/docs/guide/callbacks.rst
@ -155,9 +155,16 @@ Stable Baselines provides you with a set of common callbacks for:
 CheckpointCallback
 ^^^^^^^^^^^^^^^^^^

-Callback for saving a model every ``save_freq`` steps, you must specify a log folder (``save_path``)
+Callback for saving a model every ``save_freq`` calls to ``env.step()``, you must specify a log folder (``save_path``)
 and optionally a prefix for the checkpoints (``rl_model`` by default).

+.. warning::
+
+  When using multiple environments, each call to ``env.step()`` will effectively correspond to ``n_envs`` steps.
+  If you want the ``save_freq`` to be similar when using a different number of environments,
+  you need to account for it using ``save_freq = max(save_freq // n_envs, 1)``.
+  The same goes for the other callbacks.
+

 .. code-block:: python

--- a/docs/guide/migration.rst
+++ b/docs/guide/migration.rst
@ -46,7 +46,8 @@ Breaking Changes

 - SB3 requires python 3.6+ (instead of python 3.5+ for SB2)
 - Dropped MPI support
- Dropped layer normalized policies (e.g. ``LnMlpPolicy``)
+- Dropped layer normalized policies (``MlpLnLstmPolicy``, ``CnnLnLstmPolicy``)
+- LSTM policies (``MlpLstmPolicy``, ``CnnLstmPolicy``) are not supported for the time being
 - Dropped parameter noise for DDPG and DQN
 - PPO is now closer to the original implementation (no clipping of the value function by default), cf PPO section below
 - Orthogonal initialization is only used by A2C/PPO
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@ -27,6 +27,7 @@ Breaking Changes:

 - Updated the KL Divergence estimator in the PPO algorithm to be positive definite and have lower variance (@09tangriro)
 - Updated the KL Divergence check in the PPO algorithm to be before the gradient update step rather than after end of epoch (@09tangriro)
+- Removed parameter ``channels_last`` from ``is_image_space`` as it can be inferred.

 New Features:
 ^^^^^^^^^^^^^
@ -63,6 +64,7 @@ Others:
 - Added Code of Conduct
 - Added tests for GAE and lambda return computation
 - Updated distribution entropy test (thanks @09tangriro)
+- Added sanity check ``batch_size > 1`` in PPO to avoid NaN in advantage normalization

 Documentation:
 ^^^^^^^^^^^^^^
@ -76,6 +78,10 @@ Documentation:
 - Added example for using ``ProcgenEnv``
 - Added note about advanced custom policy example for off-policy algorithms
 - Fixed DQN unicode checkmarks
+- Updated migration guide (@juancroldan)
+- Pinned ``docutils==0.16`` to avoid issue with rtd theme
+- Clarified callback ``save_freq`` definition
+

 Release 1.0 (2021-03-15)
 ------------------------
@ -691,4 +697,4 @@ And all the contributors:
@tirafesi @blurLake @koulakis @joeljosephjin @shwang @rk37 @andyshih12 @RaphaelWag @xicocaio
@diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @SwamyDev @wmmc88 @megan-klaiber @thisray
@tfederico @hn2 @LucasAlegre @AptX395 @zampanteymedio @JadenTravnik @decodyng @ardabbour @lorenz-h @mschweizer @lorepieri8 @vwxyzjn
-@ShangqunYu @PierreExeter @JacopoPan @ltbd78 @tom-doerr @Atlis @liusida @09tangriro @amy12xx
+@ShangqunYu @PierreExeter @JacopoPan @ltbd78 @tom-doerr @Atlis @liusida @09tangriro @amy12xx @juancroldan
--- a/stable_baselines3/common/callbacks.py
+++ b/stable_baselines3/common/callbacks.py
@ -212,7 +212,14 @@ class CallbackList(BaseCallback):

 class CheckpointCallback(BaseCallback):
    """
-    Callback for saving a model every ``save_freq`` steps
+    Callback for saving a model every ``save_freq`` calls
+    to ``env.step()``.
+
+    .. warning::
+
+      When using multiple environments, each call to ``env.step()``
+      will effectively correspond to ``n_envs`` steps.
+      To account for that, you can use ``save_freq = max(save_freq // n_envs, 1)``.

    :param save_freq:
    :param save_path: Path to the folder where the model will be saved.
@ -262,11 +269,17 @@ class EvalCallback(EventCallback):
    """
    Callback for evaluating an agent.

+    .. warning::
+
+      When using multiple environments, each call to ``env.step()``
+      will effectively correspond to ``n_envs`` steps.
+      To account for that, you can use ``eval_freq = max(eval_freq // n_envs, 1)``.
+
    :param eval_env: The environment used for initialization
    :param callback_on_new_best: Callback to trigger
        when there is a new best model according to the ``mean_reward``
    :param n_eval_episodes: The number of episodes to test the agent
-    :param eval_freq: Evaluate the agent every eval_freq call of the callback.
+    :param eval_freq: Evaluate the agent every ``eval_freq`` calls of the callback.
    :param log_path: Path to a folder where the evaluations (``evaluations.npz``)
        will be saved. It will be updated at each evaluation.
    :param best_model_save_path: Path to a folder where the best model
--- a/stable_baselines3/common/preprocessing.py
+++ b/stable_baselines3/common/preprocessing.py
@ -26,19 +26,16 @@ def is_image_space_channels_first(observation_space: spaces.Box) -> bool:

 def is_image_space(
    observation_space: spaces.Space,
-    channels_last: bool = True,
    check_channels: bool = False,
 ) -> bool:
    """
    Check if a observation space has the shape, limits and dtype
    of a valid image.
-    The check is conservative, so that it returns False
-    if there is a doubt.
+    The check is conservative, so that it returns False if there is a doubt.

    Valid images: RGB, RGBD, GrayScale with values in [0, 255]

    :param observation_space:
-    :param channels_last:
    :param check_channels: Whether to do or not the check for the number of channels.
        e.g., with frame-stacking, the observation space may have more channels than expected.
    :return:
@ -56,10 +53,10 @@ def is_image_space(
        if not check_channels:
            return True
        # Check the number of channels
-        if channels_last:
-            n_channels = observation_space.shape[-1]
-        else:
+        if is_image_space_channels_first(observation_space):
            n_channels = observation_space.shape[0]
+        else:
+            n_channels = observation_space.shape[-1]
        # RGB, RGBD, GrayScale
        return n_channels in [1, 3, 4]
    return False
--- a/stable_baselines3/common/torch_layers.py
+++ b/stable_baselines3/common/torch_layers.py
@ -64,7 +64,7 @@ class NatureCNN(BaseFeaturesExtractor):
        super(NatureCNN, self).__init__(observation_space, features_dim)
        # We assume CxHxW images (channels first)
        # Re-ordering will be done by pre-preprocessing or wrapper
-        assert is_image_space(observation_space), (
+        assert is_image_space(observation_space, check_channels=False), (
            "You should use NatureCNN "
            f"only with images not with {observation_space}\n"
            "(you are probably using `CnnPolicy` instead of `MlpPolicy` or `MultiInputPolicy`)\n"
--- a/stable_baselines3/ppo/ppo.py
+++ b/stable_baselines3/ppo/ppo.py
@ -118,6 +118,13 @@ class PPO(OnPolicyAlgorithm):
                spaces.MultiBinary,
            ),
        )
+
+        # Sanity check, otherwise it will lead to noisy gradient and NaN
+        # because of the advantage normalization
+        assert (
+            batch_size > 1
+        ), "`batch_size` must be greater than 1. See https://github.com/DLR-RM/stable-baselines3/issues/440"
+
        if self.env is not None:
            # Check that `n_steps * n_envs > 1` to avoid NaN
            # when doing advantage normalization
@ -133,7 +140,7 @@ class PPO(OnPolicyAlgorithm):
                    f" but because the `RolloutBuffer` is of size `n_steps * n_envs = {buffer_size}`,"
                    f" after every {untruncated_batches} untruncated mini-batches,"
                    f" there will be a truncated mini-batch of size {buffer_size % batch_size}\n"
-                    f"We recommend using a `batch_size` that is a multiple of `n_steps * n_envs`.\n"
+                    f"We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.\n"
                    f"Info: (n_steps={self.n_steps} and n_envs={self.env.num_envs})"
                )
        self.batch_size = batch_size
--- a/tests/test_cnn.py
+++ b/tests/test_cnn.py
@ -247,7 +247,12 @@ def test_image_space_checks():
    assert not is_image_space(not_image_space)

    an_image_space = spaces.Box(0, 255, shape=(10, 10, 3), dtype=np.uint8)
-    assert is_image_space(an_image_space)
+    assert is_image_space(an_image_space, check_channels=False)
+    assert is_image_space(an_image_space, check_channels=True)
+
+    channel_first_image_space = spaces.Box(0, 255, shape=(3, 10, 10), dtype=np.uint8)
+    assert is_image_space(channel_first_image_space, check_channels=False)
+    assert is_image_space(channel_first_image_space, check_channels=True)

    an_image_space_with_odd_channels = spaces.Box(0, 255, shape=(10, 10, 5), dtype=np.uint8)
    assert is_image_space(an_image_space_with_odd_channels)