Merge branch 'master' into feat/multienv-off-policy

This commit is contained in:
Antonin RAFFIN 2021-05-23 13:27:44 +02:00 committed by GitHub
commit a3e1ea4a32
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 54 additions and 15 deletions

View file

@ -15,3 +15,6 @@ dependencies:
- numpy
- matplotlib
- sphinx_autodoc_typehints
# Tmp fix, docutils==0.17 breaks rtd theme
# See https://github.com/readthedocs/sphinx_rtd_theme/issues/1115
- docutils==0.16

View file

@ -155,9 +155,16 @@ Stable Baselines provides you with a set of common callbacks for:
CheckpointCallback
^^^^^^^^^^^^^^^^^^
Callback for saving a model every ``save_freq`` steps, you must specify a log folder (``save_path``)
Callback for saving a model every ``save_freq`` calls to ``env.step()``. You must specify a log folder (``save_path``)
and optionally a prefix for the checkpoints (``rl_model`` by default).
.. warning::
When using multiple environments, each call to ``env.step()`` will effectively correspond to ``n_envs`` steps.
If you want the ``save_freq`` to be similar when using a different number of environments,
you need to account for it using ``save_freq = max(save_freq // n_envs, 1)``.
The same goes for the other callbacks.
.. code-block:: python

View file

@ -46,7 +46,8 @@ Breaking Changes
- SB3 requires python 3.6+ (instead of python 3.5+ for SB2)
- Dropped MPI support
- Dropped layer normalized policies (e.g. ``LnMlpPolicy``)
- Dropped layer normalized policies (``MlpLnLstmPolicy``, ``CnnLnLstmPolicy``)
- LSTM policies (``MlpLstmPolicy``, ``CnnLstmPolicy``) are not supported for the time being
- Dropped parameter noise for DDPG and DQN
- PPO is now closer to the original implementation (no clipping of the value function by default), cf PPO section below
- Orthogonal initialization is only used by A2C/PPO

View file

@ -27,6 +27,7 @@ Breaking Changes:
- Updated the KL Divergence estimator in the PPO algorithm to be positive definite and have lower variance (@09tangriro)
- Updated the KL Divergence check in the PPO algorithm to be before the gradient update step rather than after end of epoch (@09tangriro)
- Removed parameter ``channels_last`` from ``is_image_space`` as it can be inferred.
New Features:
^^^^^^^^^^^^^
@ -63,6 +64,7 @@ Others:
- Added Code of Conduct
- Added tests for GAE and lambda return computation
- Updated distribution entropy test (thanks @09tangriro)
- Added sanity check ``batch_size > 1`` in PPO to avoid NaN in advantage normalization
Documentation:
^^^^^^^^^^^^^^
@ -76,6 +78,10 @@ Documentation:
- Added example for using ``ProcgenEnv``
- Added note about advanced custom policy example for off-policy algorithms
- Fixed DQN unicode checkmarks
- Updated migration guide (@juancroldan)
- Pinned ``docutils==0.16`` to avoid issue with rtd theme
- Clarified callback ``save_freq`` definition
Release 1.0 (2021-03-15)
------------------------
@ -691,4 +697,4 @@ And all the contributors:
@tirafesi @blurLake @koulakis @joeljosephjin @shwang @rk37 @andyshih12 @RaphaelWag @xicocaio
@diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @SwamyDev @wmmc88 @megan-klaiber @thisray
@tfederico @hn2 @LucasAlegre @AptX395 @zampanteymedio @JadenTravnik @decodyng @ardabbour @lorenz-h @mschweizer @lorepieri8 @vwxyzjn
@ShangqunYu @PierreExeter @JacopoPan @ltbd78 @tom-doerr @Atlis @liusida @09tangriro @amy12xx
@ShangqunYu @PierreExeter @JacopoPan @ltbd78 @tom-doerr @Atlis @liusida @09tangriro @amy12xx @juancroldan

View file

@ -212,7 +212,14 @@ class CallbackList(BaseCallback):
class CheckpointCallback(BaseCallback):
"""
Callback for saving a model every ``save_freq`` steps
Callback for saving a model every ``save_freq`` calls
to ``env.step()``.
.. warning::
When using multiple environments, each call to ``env.step()``
will effectively correspond to ``n_envs`` steps.
To account for that, you can use ``save_freq = max(save_freq // n_envs, 1)``
:param save_freq:
:param save_path: Path to the folder where the model will be saved.
@ -262,11 +269,17 @@ class EvalCallback(EventCallback):
"""
Callback for evaluating an agent.
.. warning::
When using multiple environments, each call to ``env.step()``
will effectively correspond to ``n_envs`` steps.
To account for that, you can use ``eval_freq = max(eval_freq // n_envs, 1)``
:param eval_env: The environment used for initialization
:param callback_on_new_best: Callback to trigger
when there is a new best model according to the ``mean_reward``
:param n_eval_episodes: The number of episodes to test the agent
:param eval_freq: Evaluate the agent every eval_freq call of the callback.
:param eval_freq: Evaluate the agent every ``eval_freq`` calls of the callback.
:param log_path: Path to a folder where the evaluations (``evaluations.npz``)
will be saved. It will be updated at each evaluation.
:param best_model_save_path: Path to a folder where the best model

View file

@ -26,19 +26,16 @@ def is_image_space_channels_first(observation_space: spaces.Box) -> bool:
def is_image_space(
observation_space: spaces.Space,
channels_last: bool = True,
check_channels: bool = False,
) -> bool:
"""
Check if an observation space has the shape, limits and dtype
of a valid image.
The check is conservative, so that it returns False
if there is a doubt.
The check is conservative, so that it returns False if there is a doubt.
Valid images: RGB, RGBD, GrayScale with values in [0, 255]
:param observation_space:
:param channels_last:
:param check_channels: Whether or not to check the number of channels.
e.g., with frame-stacking, the observation space may have more channels than expected.
:return:
@ -56,10 +53,10 @@ def is_image_space(
if not check_channels:
return True
# Check the number of channels
if channels_last:
n_channels = observation_space.shape[-1]
else:
if is_image_space_channels_first(observation_space):
n_channels = observation_space.shape[0]
else:
n_channels = observation_space.shape[-1]
# RGB, RGBD, GrayScale
return n_channels in [1, 3, 4]
return False

View file

@ -64,7 +64,7 @@ class NatureCNN(BaseFeaturesExtractor):
super(NatureCNN, self).__init__(observation_space, features_dim)
# We assume CxHxW images (channels first)
# Re-ordering will be done by pre-preprocessing or wrapper
assert is_image_space(observation_space), (
assert is_image_space(observation_space, check_channels=False), (
"You should use NatureCNN "
f"only with images not with {observation_space}\n"
"(you are probably using `CnnPolicy` instead of `MlpPolicy` or `MultiInputPolicy`)\n"

View file

@ -118,6 +118,13 @@ class PPO(OnPolicyAlgorithm):
spaces.MultiBinary,
),
)
# Sanity check, otherwise it will lead to noisy gradient and NaN
# because of the advantage normalization
assert (
batch_size > 1
), "`batch_size` must be greater than 1. See https://github.com/DLR-RM/stable-baselines3/issues/440"
if self.env is not None:
# Check that `n_steps * n_envs > 1` to avoid NaN
# when doing advantage normalization
@ -133,7 +140,7 @@ class PPO(OnPolicyAlgorithm):
f" but because the `RolloutBuffer` is of size `n_steps * n_envs = {buffer_size}`,"
f" after every {untruncated_batches} untruncated mini-batches,"
f" there will be a truncated mini-batch of size {buffer_size % batch_size}\n"
f"We recommend using a `batch_size` that is a multiple of `n_steps * n_envs`.\n"
f"We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.\n"
f"Info: (n_steps={self.n_steps} and n_envs={self.env.num_envs})"
)
self.batch_size = batch_size

View file

@ -247,7 +247,12 @@ def test_image_space_checks():
assert not is_image_space(not_image_space)
an_image_space = spaces.Box(0, 255, shape=(10, 10, 3), dtype=np.uint8)
assert is_image_space(an_image_space)
assert is_image_space(an_image_space, check_channels=False)
assert is_image_space(an_image_space, check_channels=True)
channel_first_image_space = spaces.Box(0, 255, shape=(3, 10, 10), dtype=np.uint8)
assert is_image_space(channel_first_image_space, check_channels=False)
assert is_image_space(channel_first_image_space, check_channels=True)
an_image_space_with_odd_channels = spaces.Box(0, 255, shape=(10, 10, 5), dtype=np.uint8)
assert is_image_space(an_image_space_with_odd_channels)