mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-31 23:28:05 +00:00
Add base doc
This commit is contained in:
parent
94b1267817
commit
d17f29c8ad
29 changed files with 1194 additions and 58 deletions
27
README.md
27
README.md
|
|
@ -35,6 +35,29 @@ These algorithms will make it easier for the research community and industry to
|
|||
|
||||
<!-- | Tensorboard support | :heavy_check_mark: | -->
|
||||
|
||||
### Roadmap to V1.0
|
||||
|
||||
Please look at the issue for more details.
|
||||
Planned features:
|
||||
|
||||
- [ ] DQN (almost ready, currently in testing phase)
|
||||
- [ ] DDPG (you can use its successor TD3 for now)
|
||||
- [ ] HER
|
||||
- [ ] Support for MultiDiscrete and MultiBinary action spaces
|
||||
|
||||
### Planned features (v1.1+)
|
||||
|
||||
- [ ] Full Tensorboard support
|
||||
- [ ] DQN extensions (prioritized replay, double q-learning, ...)
|
||||
- [ ] Support for `Tuple` and `Dict` observation spaces
|
||||
- [ ] Recurrent Policies
|
||||
- [ ] TRPO
|
||||
|
||||
|
||||
## Migration guide
|
||||
|
||||
**TODO: migration guide from Stable-Baselines in the documentation**
|
||||
|
||||
## Documentation
|
||||
|
||||
Documentation is available online: [https://stable-baselines.readthedocs.io/](https://stable-baselines.readthedocs.io/)
|
||||
|
|
@ -102,8 +125,10 @@ model.learn(total_timesteps=10000)
|
|||
obs = env.reset()
|
||||
for i in range(1000):
|
||||
action, _states = model.predict(obs, deterministic=True)
|
||||
obs, rewards, dones, info = env.step(action)
|
||||
obs, reward, done, info = env.step(action)
|
||||
env.render()
|
||||
if done:
|
||||
obs = env.reset()
|
||||
|
||||
env.close()
|
||||
```
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
## Stable Baselines Documentation
|
||||
## Stable Baselines3 Documentation
|
||||
|
||||
This folder contains documentation for the RL baselines.
|
||||
|
||||
|
|
|
|||
BIN
docs/_static/img/mistake.png
vendored
Normal file
BIN
docs/_static/img/mistake.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 145 KiB |
26
docs/common/distributions.rst
Normal file
26
docs/common/distributions.rst
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
.. _distributions:
|
||||
|
||||
Probability Distributions
|
||||
=========================
|
||||
|
||||
Probability distributions used for the different action spaces:
|
||||
|
||||
- ``CategoricalDistribution`` -> Discrete
|
||||
- ``DiagGaussianDistribution`` -> Box (continuous actions)
|
||||
- ``StateDependentNoiseDistribution`` -> Box (continuous actions) when ``use_sde=True``
|
||||
|
||||
.. - ``MultiCategoricalDistribution`` -> MultiDiscrete
|
||||
.. - ``BernoulliDistribution`` -> MultiBinary
|
||||
|
||||
The policy networks output parameters for the distributions (named ``flat`` in the methods).
|
||||
Actions are then sampled from those distributions.
|
||||
|
||||
For instance, in the case of discrete actions, the policy network outputs the probability
|
||||
of taking each action. The ``CategoricalDistribution`` allows sampling from it,
|
||||
computing the entropy and the log probability (``log_prob``), and backpropagating the gradient.
|
||||
|
||||
In the case of continuous actions, a Gaussian distribution is used. The policy network outputs
|
||||
mean and (log) std of the distribution (assumed to be a ``DiagGaussianDistribution``).
|
||||
|
||||
.. automodule:: stable_baselines3.common.distributions
|
||||
:members:
|
||||
7
docs/common/env_checker.rst
Normal file
7
docs/common/env_checker.rst
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
.. _env_checker:
|
||||
|
||||
Gym Environment Checker
|
||||
========================
|
||||
|
||||
.. automodule:: stable_baselines3.common.env_checker
|
||||
:members:
|
||||
7
docs/common/evaluation.rst
Normal file
7
docs/common/evaluation.rst
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
.. _eval:
|
||||
|
||||
Evaluation Helper
|
||||
=================
|
||||
|
||||
.. automodule:: stable_baselines3.common.evaluation
|
||||
:members:
|
||||
|
|
@ -216,5 +216,5 @@ texinfo_documents = [
|
|||
# }
|
||||
|
||||
# kornia's hack to get rtd builder to install latest pytorch
|
||||
# if 'READTHEDOCS' in os.environ:
|
||||
# os.system('pip install torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html')
|
||||
if on_rtd:
|
||||
os.system('pip install torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html')
|
||||
|
|
|
|||
296
docs/guide/callbacks.rst
Normal file
296
docs/guide/callbacks.rst
Normal file
|
|
@ -0,0 +1,296 @@
|
|||
.. _callbacks:
|
||||
|
||||
Callbacks
|
||||
=========
|
||||
|
||||
A callback is a set of functions that will be called at given stages of the training procedure.
|
||||
You can use callbacks to access internal state of the RL model during training.
|
||||
It allows one to do monitoring, auto saving, model manipulation, progress bars, ...
|
||||
|
||||
|
||||
Custom Callback
|
||||
---------------
|
||||
|
||||
To build a custom callback, you need to create a class that derives from ``BaseCallback``.
|
||||
This will give you access to events (``_on_training_start``, ``_on_step``) and useful variables (like ``self.model`` for the RL model).
|
||||
|
||||
|
||||
.. You can find two examples of custom callbacks in the documentation: one for saving the best model according to the training reward (see :ref:`Examples <examples>`), and one for logging additional values with Tensorboard (see :ref:`Tensorboard section <tensorboard>`).
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from stable_baselines3.common.callbacks import BaseCallback
|
||||
|
||||
|
||||
class CustomCallback(BaseCallback):
|
||||
"""
|
||||
A custom callback that derives from ``BaseCallback``.
|
||||
|
||||
:param verbose: (int) Verbosity level 0: no output 1: info 2: debug
|
||||
"""
|
||||
def __init__(self, verbose=0):
|
||||
super(CustomCallback, self).__init__(verbose)
|
||||
# Those variables will be accessible in the callback
|
||||
# (they are defined in the base class)
|
||||
# The RL model
|
||||
# self.model = None # type: BaseRLModel
|
||||
# An alias for self.model.get_env(), the environment used for training
|
||||
# self.training_env = None # type: Union[gym.Env, VecEnv, None]
|
||||
# Number of times the callback was called
|
||||
# self.n_calls = 0 # type: int
|
||||
# self.num_timesteps = 0 # type: int
|
||||
# local and global variables
|
||||
# self.locals = None # type: Dict[str, Any]
|
||||
# self.globals = None # type: Dict[str, Any]
|
||||
# The logger object, used to report things in the terminal
|
||||
# self.logger = None # type: logger.Logger
|
||||
# # Sometimes, for event callback, it is useful
|
||||
# # to have access to the parent object
|
||||
# self.parent = None # type: Optional[BaseCallback]
|
||||
|
||||
def _on_training_start(self) -> None:
|
||||
"""
|
||||
This method is called before the first rollout starts.
|
||||
"""
|
||||
pass
|
||||
|
||||
def _on_rollout_start(self) -> None:
|
||||
"""
|
||||
A rollout is the collection of environment interaction
|
||||
using the current policy.
|
||||
This event is triggered before collecting new samples.
|
||||
"""
|
||||
pass
|
||||
|
||||
def _on_step(self) -> bool:
|
||||
"""
|
||||
This method will be called by the model after each call to `env.step()`.
|
||||
|
||||
For child callback (of an `EventCallback`), this will be called
|
||||
when the event is triggered.
|
||||
|
||||
:return: (bool) If the callback returns False, training is aborted early.
|
||||
"""
|
||||
return True
|
||||
|
||||
def _on_rollout_end(self) -> None:
|
||||
"""
|
||||
This event is triggered before updating the policy.
|
||||
"""
|
||||
pass
|
||||
|
||||
def _on_training_end(self) -> None:
|
||||
"""
|
||||
This event is triggered before exiting the `learn()` method.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
.. note::
|
||||
``self.num_timesteps`` corresponds to the total number of steps taken in the environment, i.e., it is the number of environments multiplied by the number of times ``env.step()`` was called.
|
||||
|
||||
In other words, ``self.num_timesteps`` is incremented by ``n_envs`` (number of environments) after each call to ``env.step()``.
|
||||
|
||||
|
||||
.. note::
|
||||
|
||||
For off-policy algorithms like SAC, DDPG, TD3 or DQN, the notion of ``rollout`` corresponds to the steps taken in the environment between two updates.
|
||||
|
||||
|
||||
.. _EventCallback:
|
||||
|
||||
Event Callback
|
||||
--------------
|
||||
|
||||
Compared to Keras, Stable Baselines provides a second type of ``BaseCallback``, named ``EventCallback`` that is meant to trigger events. When an event is triggered, then a child callback is called.
|
||||
|
||||
As an example, :ref:`EvalCallback` is an ``EventCallback`` that will trigger its child callback when there is a new best model.
|
||||
A child callback is for instance :ref:`StopTrainingOnRewardThreshold <StopTrainingCallback>` that stops the training if the mean reward achieved by the RL model is above a threshold.
|
||||
|
||||
.. note::
|
||||
|
||||
We recommend taking a look at the source code of :ref:`EvalCallback` and :ref:`StopTrainingOnRewardThreshold <StopTrainingCallback>` to have a better overview of what can be achieved with this kind of callback.
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
class EventCallback(BaseCallback):
|
||||
"""
|
||||
Base class for triggering callback on event.
|
||||
|
||||
:param callback: (Optional[BaseCallback]) Callback that will be called
|
||||
when an event is triggered.
|
||||
:param verbose: (int)
|
||||
"""
|
||||
def __init__(self, callback: Optional[BaseCallback] = None, verbose: int = 0):
|
||||
super(EventCallback, self).__init__(verbose=verbose)
|
||||
self.callback = callback
|
||||
# Give access to the parent
|
||||
if callback is not None:
|
||||
self.callback.parent = self
|
||||
...
|
||||
|
||||
def _on_event(self) -> bool:
|
||||
if self.callback is not None:
|
||||
return self.callback()
|
||||
return True
|
||||
|
||||
|
||||
|
||||
Callback Collection
|
||||
-------------------
|
||||
|
||||
Stable Baselines provides you with a set of common callbacks for:
|
||||
|
||||
- saving the model periodically (:ref:`CheckpointCallback`)
|
||||
- evaluating the model periodically and saving the best one (:ref:`EvalCallback`)
|
||||
- chaining callbacks (:ref:`CallbackList`)
|
||||
- triggering callback on events (:ref:`EventCallback`, :ref:`EveryNTimesteps`)
|
||||
- stopping the training early based on a reward threshold (:ref:`StopTrainingOnRewardThreshold <StopTrainingCallback>`)
|
||||
|
||||
|
||||
.. _CheckpointCallback:
|
||||
|
||||
CheckpointCallback
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Callback for saving a model every ``save_freq`` steps, you must specify a log folder (``save_path``)
|
||||
and optionally a prefix for the checkpoints (``rl_model`` by default).
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from stable_baselines3 import SAC
|
||||
from stable_baselines3.common.callbacks import CheckpointCallback
|
||||
# Save a checkpoint every 1000 steps
|
||||
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/',
|
||||
name_prefix='rl_model')
|
||||
|
||||
model = SAC('MlpPolicy', 'Pendulum-v0')
|
||||
model.learn(2000, callback=checkpoint_callback)
|
||||
|
||||
|
||||
.. _EvalCallback:
|
||||
|
||||
EvalCallback
|
||||
^^^^^^^^^^^^
|
||||
|
||||
Evaluate periodically the performance of an agent, using a separate test environment.
|
||||
It will save the best model if the ``best_model_save_path`` folder is specified and save the evaluation results in a numpy archive (``evaluations.npz``) if the ``log_path`` folder is specified.
|
||||
|
||||
|
||||
.. note::
|
||||
|
||||
You can pass a child callback via the ``callback_on_new_best`` argument. It will be triggered each time there is a new best model.
|
||||
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import gym
|
||||
|
||||
from stable_baselines3 import SAC
|
||||
from stable_baselines3.common.callbacks import EvalCallback
|
||||
|
||||
# Separate evaluation env
|
||||
eval_env = gym.make('Pendulum-v0')
|
||||
# Use deterministic actions for evaluation
|
||||
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/',
|
||||
log_path='./logs/', eval_freq=500,
|
||||
deterministic=True, render=False)
|
||||
|
||||
model = SAC('MlpPolicy', 'Pendulum-v0')
|
||||
model.learn(5000, callback=eval_callback)
|
||||
|
||||
|
||||
.. _Callbacklist:
|
||||
|
||||
CallbackList
|
||||
^^^^^^^^^^^^
|
||||
|
||||
Class for chaining callbacks, they will be called sequentially.
|
||||
Alternatively, you can pass a list of callbacks directly to the ``learn()`` method; it will be converted automatically to a ``CallbackList``.
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import gym
|
||||
|
||||
from stable_baselines3 import SAC
|
||||
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback
|
||||
|
||||
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/')
|
||||
# Separate evaluation env
|
||||
eval_env = gym.make('Pendulum-v0')
|
||||
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model',
|
||||
log_path='./logs/results', eval_freq=500)
|
||||
# Create the callback list
|
||||
callback = CallbackList([checkpoint_callback, eval_callback])
|
||||
|
||||
model = SAC('MlpPolicy', 'Pendulum-v0')
|
||||
# Equivalent to:
|
||||
# model.learn(5000, callback=[checkpoint_callback, eval_callback])
|
||||
model.learn(5000, callback=callback)
|
||||
|
||||
|
||||
.. _StopTrainingCallback:
|
||||
|
||||
StopTrainingOnRewardThreshold
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Stop the training once a threshold in episodic reward (mean episode reward over the evaluations) has been reached (i.e., when the model is good enough).
|
||||
It must be used with the :ref:`EvalCallback` and use the event triggered by a new best model.
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import gym
|
||||
|
||||
from stable_baselines3 import SAC
|
||||
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
|
||||
|
||||
# Separate evaluation env
|
||||
eval_env = gym.make('Pendulum-v0')
|
||||
# Stop training when the model reaches the reward threshold
|
||||
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1)
|
||||
eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1)
|
||||
|
||||
model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1)
|
||||
# Almost infinite number of timesteps, but the training will stop
|
||||
# early as soon as the reward threshold is reached
|
||||
model.learn(int(1e10), callback=eval_callback)
|
||||
|
||||
|
||||
.. _EveryNTimesteps:
|
||||
|
||||
EveryNTimesteps
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
An :ref:`EventCallback` that will trigger its child callback every ``n_steps`` timesteps.
|
||||
|
||||
|
||||
.. note::
|
||||
|
||||
Because of the way ``PPO1`` and ``TRPO`` work (they rely on MPI), ``n_steps`` is a lower bound between two events.
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import gym
|
||||
|
||||
from stable_baselines3 import PPO
|
||||
from stable_baselines3.common.callbacks import CheckpointCallback, EveryNTimesteps
|
||||
|
||||
# this is equivalent to defining CheckpointCallback(save_freq=500)
|
||||
# checkpoint_callback will be triggered every 500 steps
|
||||
checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/')
|
||||
event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)
|
||||
|
||||
model = PPO('MlpPolicy', 'Pendulum-v0', verbose=1)
|
||||
|
||||
model.learn(int(2e4), callback=event_callback)
|
||||
|
||||
|
||||
.. automodule:: stable_baselines3.common.callbacks
|
||||
:members:
|
||||
164
docs/guide/checking_nan.rst
Normal file
164
docs/guide/checking_nan.rst
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
Dealing with NaNs and infs
|
||||
==========================
|
||||
|
||||
During the training of a model on a given environment, it is possible that the RL model becomes completely
|
||||
corrupted when a NaN or an inf is given or returned from the RL model.
|
||||
|
||||
How and why?
|
||||
------------
|
||||
|
||||
The issue arises when NaNs or infs do not crash, but simply get propagated through the training,
|
||||
until all the floating point numbers converge to NaN or inf. This is in line with the
|
||||
`IEEE Standard for Floating-Point Arithmetic (IEEE 754) <https://ieeexplore.ieee.org/document/4610935>`_, which states:
|
||||
|
||||
.. note::
|
||||
Five possible exceptions can occur:
|
||||
- Invalid operation (:math:`\sqrt{-1}`, :math:`\inf \times 0`, :math:`\text{NaN}\ \mathrm{mod}\ 1`, ...) returns NaN
|
||||
- Division by zero:
|
||||
- if the operand is not zero (:math:`1/0`, :math:`-2/0`, ...) returns :math:`\pm\inf`
|
||||
- if the operand is zero (:math:`0/0`) returns signaling NaN
|
||||
- Overflow (exponent too high to represent) returns :math:`\pm\inf`
|
||||
- Underflow (exponent too low to represent) returns :math:`0`
|
||||
- Inexact (not representable exactly in base 2, eg: :math:`1/5`) returns the rounded value (ex: :code:`assert (1/5) * 3 == 0.6000000000000001`)
|
||||
|
||||
And of these, only ``Division by zero`` will signal an exception, the rest will propagate invalid values quietly.
|
||||
|
||||
In python, dividing by zero will indeed raise the exception: ``ZeroDivisionError: float division by zero``,
|
||||
but ignores the rest.
|
||||
|
||||
By default, numpy will warn: ``RuntimeWarning: invalid value encountered``
|
||||
but will not halt the code.
|
||||
|
||||
|
||||
Anomaly detection with PyTorch
|
||||
------------------------------
|
||||
|
||||
To enable NaN detection in PyTorch you can do
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import torch as th
|
||||
th.autograd.set_detect_anomaly(True)
|
||||
|
||||
|
||||
Numpy parameters
|
||||
----------------
|
||||
|
||||
Numpy has a convenient way of dealing with invalid values: `numpy.seterr <https://docs.scipy.org/doc/numpy/reference/generated/numpy.seterr.html>`_,
|
||||
which defines for the python process, how it should handle floating point error.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import numpy as np
|
||||
|
||||
np.seterr(all='raise') # define before your code.
|
||||
|
||||
print("numpy test:")
|
||||
|
||||
a = np.float64(1.0)
|
||||
b = np.float64(0.0)
|
||||
val = a / b # this will now raise an exception instead of a warning.
|
||||
print(val)
|
||||
|
||||
but this will also avoid overflow issues on floating point numbers:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import numpy as np
|
||||
|
||||
np.seterr(all='raise') # define before your code.
|
||||
|
||||
print("numpy overflow test:")
|
||||
|
||||
a = np.float64(10)
|
||||
b = np.float64(1000)
|
||||
val = a ** b # this will now raise an exception
|
||||
print(val)
|
||||
|
||||
but will not avoid the propagation issues:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import numpy as np
|
||||
|
||||
np.seterr(all='raise') # define before your code.
|
||||
|
||||
print("numpy propagation test:")
|
||||
|
||||
a = np.float64('NaN')
|
||||
b = np.float64(1.0)
|
||||
val = a + b # this will neither warn nor raise anything
|
||||
print(val)
|
||||
|
||||
|
||||
VecCheckNan Wrapper
|
||||
-------------------
|
||||
|
||||
In order to find when and where the invalid value originated, stable-baselines3 comes with a ``VecCheckNan`` wrapper.
|
||||
|
||||
It will monitor the actions, observations, and rewards, indicating what action or observation caused it and from what.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import gym
|
||||
from gym import spaces
|
||||
import numpy as np
|
||||
|
||||
from stable_baselines3 import PPO
|
||||
from stable_baselines3.common.vec_env import DummyVecEnv, VecCheckNan
|
||||
|
||||
class NanAndInfEnv(gym.Env):
|
||||
"""Custom Environment that raised NaNs and Infs"""
|
||||
metadata = {'render.modes': ['human']}
|
||||
|
||||
def __init__(self):
|
||||
super(NanAndInfEnv, self).__init__()
|
||||
self.action_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)
|
||||
self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)
|
||||
|
||||
def step(self, _action):
|
||||
randf = np.random.rand()
|
||||
if randf > 0.99:
|
||||
obs = float('NaN')
|
||||
elif randf > 0.98:
|
||||
obs = float('inf')
|
||||
else:
|
||||
obs = randf
|
||||
return [obs], 0.0, False, {}
|
||||
|
||||
def reset(self):
|
||||
return [0.0]
|
||||
|
||||
def render(self, mode='human', close=False):
|
||||
pass
|
||||
|
||||
# Create environment
|
||||
env = DummyVecEnv([lambda: NanAndInfEnv()])
|
||||
env = VecCheckNan(env, raise_exception=True)
|
||||
|
||||
# Instantiate the agent
|
||||
model = PPO('MlpPolicy', env)
|
||||
|
||||
# Train the agent
|
||||
model.learn(total_timesteps=int(2e5)) # this will crash explaining that the invalid value originated from the environment.
|
||||
|
||||
RL Model hyperparameters
|
||||
------------------------
|
||||
|
||||
Depending on your hyperparameters, NaNs can occur much more often.
|
||||
A great example of this: https://github.com/hill-a/stable-baselines/issues/340
|
||||
|
||||
Be aware, the hyperparameters given by default seem to work in most cases,
|
||||
however your environment might not play nice with them.
|
||||
If this is the case, try to read up on the effect each hyperparameter has on the model,
|
||||
so that you can try and tune them to get a stable model. Alternatively, you can try automatic hyperparameter tuning (included in the rl zoo).
|
||||
|
||||
Missing values from datasets
|
||||
----------------------------
|
||||
|
||||
If your environment is generated from an external dataset, do not forget to make sure your dataset does not contain NaNs.
|
||||
Some datasets fill missing values with NaNs as a surrogate value.
|
||||
|
||||
Here is some reading material about finding NaNs: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
|
||||
|
||||
And filling the missing values with something else (imputation): https://towardsdatascience.com/how-to-handle-missing-data-8646b18db0d4
|
||||
82
docs/guide/custom_env.rst
Normal file
82
docs/guide/custom_env.rst
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
.. _custom_env:
|
||||
|
||||
Using Custom Environments
|
||||
==========================
|
||||
|
||||
To use the rl baselines with custom environments, they just need to follow the *gym* interface.
|
||||
That is to say, your environment must implement the following methods (and inherit from the OpenAI Gym ``gym.Env`` class):
|
||||
|
||||
|
||||
.. note::
|
||||
If you are using images as input, the input values must be in [0, 255] as the observation
|
||||
is normalized (dividing by 255 to have values in [0, 1]) when using CNN policies.
|
||||
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import gym
|
||||
from gym import spaces
|
||||
|
||||
class CustomEnv(gym.Env):
|
||||
"""Custom Environment that follows gym interface"""
|
||||
metadata = {'render.modes': ['human']}
|
||||
|
||||
def __init__(self, arg1, arg2, ...):
|
||||
super(CustomEnv, self).__init__()
|
||||
# Define action and observation space
|
||||
# They must be gym.spaces objects
|
||||
# Example when using discrete actions:
|
||||
self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)
|
||||
# Example for using image as input:
|
||||
self.observation_space = spaces.Box(low=0, high=255,
|
||||
shape=(HEIGHT, WIDTH, N_CHANNELS), dtype=np.uint8)
|
||||
|
||||
def step(self, action):
|
||||
...
|
||||
return observation, reward, done, info
|
||||
def reset(self):
|
||||
...
|
||||
return observation # reward, done, info can't be included
|
||||
def render(self, mode='human'):
|
||||
...
|
||||
def close (self):
|
||||
...
|
||||
|
||||
|
||||
Then you can define and train an RL agent with:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Instantiate the env
|
||||
env = CustomEnv(arg1, ...)
|
||||
# Define and Train the agent
|
||||
model = A2C('CnnPolicy', env).learn(total_timesteps=1000)
|
||||
|
||||
|
||||
To check that your environment follows the gym interface, please use:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from stable_baselines3.common.env_checker import check_env
|
||||
|
||||
env = CustomEnv(arg1, ...)
|
||||
# It will check your custom environment and output additional warnings if needed
|
||||
check_env(env)
|
||||
|
||||
|
||||
|
||||
We have created a `colab notebook <https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/5_custom_gym_env.ipynb>`_ for
|
||||
a concrete example of creating a custom environment.
|
||||
|
||||
You can also find a `complete guide online <https://github.com/openai/gym/blob/master/docs/creating-environments.md>`_
|
||||
on creating a custom Gym environment.
|
||||
|
||||
|
||||
Optionally, you can also register the environment with gym,
|
||||
that will allow you to create the RL agent in one line (and use ``gym.make()`` to instantiate the env).
|
||||
|
||||
|
||||
In the project, for testing purposes, we use a custom environment named ``IdentityEnv``
|
||||
defined `in this file <https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/common/identity_env.py>`_.
|
||||
An example of how to use it can be found `here <https://github.com/hill-a/stable-baselines/blob/master/tests/test_identity.py>`_.
|
||||
152
docs/guide/install.rst
Normal file
152
docs/guide/install.rst
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
.. _install:
|
||||
|
||||
Installation
|
||||
============
|
||||
|
||||
Prerequisites
|
||||
-------------
|
||||
|
||||
Stable-Baselines3 requires python 3.6+.
|
||||
|
||||
Windows 10
|
||||
~~~~~~~~~~
|
||||
|
||||
We recommend using `Anaconda <https://conda.io/docs/user-guide/install/windows.html>`_ for Windows users for easier installation of Python packages and required libraries. You need an environment with Python version 3.6 or above.
|
||||
|
||||
For a quick start you can move straight to installing Stable-Baselines3 in the next step.
|
||||
|
||||
.. note::
|
||||
|
||||
Trying to create Atari environments may result in vague errors related to missing DLL files and modules. This is an
|
||||
issue with the atari-py package. `See this discussion for more information <https://github.com/openai/atari-py/issues/65>`_.
|
||||
|
||||
|
||||
Stable Release
|
||||
~~~~~~~~~~~~~~
|
||||
To install Stable Baselines3 with pip, execute:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install stable-baselines3[extra]
|
||||
|
||||
This includes an optional dependency OpenCV to display the environments when using ``SubprocVecEnv``. If you do not need it, you can install without OpenCV:
|
||||
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install stable-baselines3
|
||||
|
||||
|
||||
Bleeding-edge version
|
||||
---------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install git+https://github.com/DLR-RM/stable-baselines3
|
||||
|
||||
|
||||
Development version
|
||||
-------------------
|
||||
|
||||
To contribute to Stable-Baselines3, install it with support for running tests and building the documentation:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
git clone https://github.com/DLR-RM/stable-baselines3 && cd stable-baselines3
|
||||
pip install -e .[docs,tests,extra]
|
||||
|
||||
|
||||
.. Using Docker Images
|
||||
.. -------------------
|
||||
..
|
||||
.. If you are looking for docker images with stable-baselines already installed in it,
|
||||
.. we recommend using images from `RL Baselines3 Zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_.
|
||||
..
|
||||
.. Otherwise, the following images contained all the dependencies for stable-baselines3 but not the stable-baselines3 package itself.
|
||||
.. They are made for development.
|
||||
..
|
||||
.. Use Built Images
|
||||
.. ~~~~~~~~~~~~~~~~
|
||||
..
|
||||
.. GPU image (requires `nvidia-docker`_):
|
||||
..
|
||||
.. .. code-block:: bash
|
||||
..
|
||||
.. docker pull stablebaselines/stable-baselines3
|
||||
..
|
||||
.. CPU only:
|
||||
..
|
||||
.. .. code-block:: bash
|
||||
..
|
||||
.. docker pull stablebaselines/stable-baselines3-cpu
|
||||
..
|
||||
.. Build the Docker Images
|
||||
.. ~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
..
|
||||
.. Build GPU image (with nvidia-docker):
|
||||
..
|
||||
.. .. code-block:: bash
|
||||
..
|
||||
.. make docker-gpu
|
||||
..
|
||||
.. Build CPU image:
|
||||
..
|
||||
.. .. code-block:: bash
|
||||
..
|
||||
.. make docker-cpu
|
||||
..
|
||||
.. Note: if you are using a proxy, you need to pass extra params during
|
||||
.. build and do some `tweaks`_:
|
||||
..
|
||||
.. .. code-block:: bash
|
||||
..
|
||||
.. --network=host --build-arg HTTP_PROXY=http://your.proxy.fr:8080/ --build-arg http_proxy=http://your.proxy.fr:8080/ --build-arg HTTPS_PROXY=https://your.proxy.fr:8080/ --build-arg https_proxy=https://your.proxy.fr:8080/
|
||||
..
|
||||
.. Run the images (CPU/GPU)
|
||||
.. ~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
..
|
||||
.. Run the nvidia-docker GPU image
|
||||
..
|
||||
.. .. code-block:: bash
|
||||
..
|
||||
.. docker run -it --runtime=nvidia --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/root/code/stable-baselines,type=bind stablebaselines/stable-baselines bash -c 'cd /root/code/stable-baselines/ && pytest tests/'
|
||||
..
|
||||
.. Or, with the shell file:
|
||||
..
|
||||
.. .. code-block:: bash
|
||||
..
|
||||
.. ./scripts/run_docker_gpu.sh pytest tests/
|
||||
..
|
||||
.. Run the docker CPU image
|
||||
..
|
||||
.. .. code-block:: bash
|
||||
..
|
||||
.. docker run -it --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/root/code/stable-baselines,type=bind stablebaselines/stable-baselines-cpu bash -c 'cd /root/code/stable-baselines/ && pytest tests/'
|
||||
..
|
||||
.. Or, with the shell file:
|
||||
..
|
||||
.. .. code-block:: bash
|
||||
..
|
||||
.. ./scripts/run_docker_cpu.sh pytest tests/
|
||||
..
|
||||
.. Explanation of the docker command:
|
||||
..
|
||||
.. - ``docker run -it`` create an instance of an image (=container), and
|
||||
.. run it interactively (so ctrl+c will work)
|
||||
.. - ``--rm`` option means to remove the container once it exits/stops
|
||||
.. (otherwise, you will have to use ``docker rm``)
|
||||
.. - ``--network host`` don't use network isolation, this allow to use
|
||||
.. tensorboard/visdom on host machine
|
||||
.. - ``--ipc=host`` Use the host system’s IPC namespace. IPC (POSIX/SysV IPC) namespace provides
|
||||
.. separation of named shared memory segments, semaphores and message
|
||||
.. queues.
|
||||
.. - ``--name test`` give explicitly the name ``test`` to the container,
|
||||
.. otherwise it will be assigned a random name
|
||||
.. - ``--mount src=...`` give access of the local directory (``pwd``
|
||||
.. command) to the container (it will be map to ``/root/code/stable-baselines``), so
|
||||
.. all the logs created in the container in this folder will be kept
|
||||
.. - ``bash -c '...'`` Run command inside the docker image, here run the tests
|
||||
.. (``pytest tests/``)
|
||||
..
|
||||
.. .. _nvidia-docker: https://github.com/NVIDIA/nvidia-docker
|
||||
.. .. _tweaks: https://stackoverflow.com/questions/23111631/cannot-download-docker-images-behind-a-proxy
|
||||
12
docs/guide/migration.rst
Normal file
12
docs/guide/migration.rst
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
.. _migration:
|
||||
|
||||
================================
|
||||
Migrating from Stable-Baselines
|
||||
================================
|
||||
|
||||
|
||||
This is a guide to migrate from Stable-Baselines to Stable-Baselines3.
|
||||
|
||||
It also references the main changes.
|
||||
|
||||
**TODO**
|
||||
|
|
@ -6,26 +6,27 @@ Getting Started
|
|||
|
||||
Most of the library tries to follow a sklearn-like syntax for the Reinforcement Learning algorithms.
|
||||
|
||||
Here is a quick example of how to train and run SAC on a Pendulum environment:
|
||||
Here is a quick example of how to train and run A2C on a CartPole environment:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import gym
|
||||
|
||||
from stable_baselines3.sac.policies import MlpPolicy
|
||||
from stable_baselines3.common.vec_env import DummyVecEnv
|
||||
from stable_baselines3 import SAC
|
||||
from stable_baselines3 import A2C
|
||||
from stable_baselines3.a2c import MlpPolicy
|
||||
|
||||
env = gym.make('Pendulum-v0')
|
||||
env = gym.make('CartPole-v1')
|
||||
|
||||
model = SAC(MlpPolicy, env, verbose=1)
|
||||
model = A2C(MlpPolicy, env, verbose=1)
|
||||
model.learn(total_timesteps=10000)
|
||||
|
||||
obs = env.reset()
|
||||
for i in range(1000):
|
||||
action = model.predict(obs)
|
||||
obs, rewards, dones, info = env.step(action)
|
||||
action, _state = model.predict(obs, deterministic=True)
|
||||
obs, reward, done, info = env.step(action)
|
||||
env.render()
|
||||
if done:
|
||||
obs = env.reset()
|
||||
|
||||
|
||||
Or just train a model with a one liner if
|
||||
|
|
@ -34,6 +35,6 @@ the policy is registered:
|
|||
|
||||
.. code-block:: python
|
||||
|
||||
from stable_baselines3 import SAC
|
||||
from stable_baselines3 import A2C
|
||||
|
||||
model = SAC('MlpPolicy', 'Pendulum-v0').learn(10000)
|
||||
model = A2C('MlpPolicy', 'CartPole-v1').learn(10000)
|
||||
|
|
|
|||
17
docs/guide/rl.rst
Normal file
17
docs/guide/rl.rst
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
.. _rl:
|
||||
|
||||
================================
|
||||
Reinforcement Learning Resources
|
||||
================================
|
||||
|
||||
|
||||
Stable-Baselines3 assumes that you already understand the basic concepts of Reinforcement Learning (RL).
|
||||
|
||||
However, if you want to learn about RL, there are several good resources to get started:
|
||||
|
||||
- `OpenAI Spinning Up <https://spinningup.openai.com/en/latest/>`_
|
||||
- `David Silver's course <http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html>`_
|
||||
- `Lilian Weng's blog <https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html>`_
|
||||
- `Berkeley's Deep RL Bootcamp <https://sites.google.com/view/deep-rl-bootcamp/lectures>`_
|
||||
- `Berkeley's Deep Reinforcement Learning course <http://rail.eecs.berkeley.edu/deeprlcourse/>`_
|
||||
- `More resources <https://github.com/dennybritz/reinforcement-learning>`_
|
||||
251
docs/guide/rl_tips.rst
Normal file
251
docs/guide/rl_tips.rst
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
.. _rl_tips:
|
||||
|
||||
======================================
|
||||
Reinforcement Learning Tips and Tricks
|
||||
======================================
|
||||
|
||||
The aim of this section is to help you do reinforcement learning experiments.
|
||||
It covers general advice about RL (where to start, which algorithm to choose, how to evaluate an algorithm, ...),
|
||||
as well as tips and tricks when using a custom environment or implementing an RL algorithm.
|
||||
|
||||
|
||||
General advice when using Reinforcement Learning
|
||||
================================================
|
||||
|
||||
TL;DR
|
||||
-----
|
||||
|
||||
1. Read about RL and Stable Baselines
|
||||
2. Do quantitative experiments and hyperparameter tuning if needed
|
||||
3. Evaluate the performance using a separate test environment
|
||||
4. For better performance, increase the training budget
|
||||
|
||||
|
||||
Like any other subject, if you want to work with RL, you should first read about it (we have a dedicated `resource page <rl.html>`_ to get you started)
|
||||
to understand what you are using. We also recommend you read Stable Baselines (SB) documentation and do the `tutorial <https://github.com/araffin/rl-tutorial-jnrr19>`_.
|
||||
It covers basic usage and guides you towards more advanced concepts of the library (e.g. callbacks and wrappers).
|
||||
|
||||
Reinforcement Learning differs from other machine learning methods in several ways. The data used to train the agent is collected
|
||||
through interactions with the environment by the agent itself (compared to supervised learning where you have a fixed dataset for instance).
|
||||
This dependence can lead to a vicious circle: if the agent collects poor quality data (e.g., trajectories with no rewards), then it will not improve and will continue to amass
|
||||
bad trajectories.
|
||||
|
||||
This factor, among others, explains why results in RL may vary from one run to another (i.e., when only the seed of the pseudo-random generator changes).
|
||||
For this reason, you should always do several runs to have quantitative results.
|
||||
|
||||
Good results in RL are generally dependent on finding appropriate hyperparameters. Recent algorithms (PPO, SAC, TD3) normally require little hyperparameter tuning,
|
||||
however, *don't expect the default ones to work* on any environment.
|
||||
|
||||
Therefore, we *highly recommend* that you take a look at the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_ (or the original papers) for tuned hyperparameters.
|
||||
A best practice when you apply RL to a new problem is to do automatic hyperparameter optimization. Again, this is included in the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_.
|
||||
|
||||
When applying RL to a custom problem, you should always normalize the input to the agent (e.g. using VecNormalize for PPO/A2C)
|
||||
and look at common preprocessing done on other environments (e.g. for `Atari <https://danieltakeshi.github.io/2016/11/25/frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/>`_, frame-stack, ...).
|
||||
Please refer to the *Tips and Tricks when creating a custom environment* section below for more advice related to custom environments.
|
||||
|
||||
|
||||
Current Limitations of RL
|
||||
-------------------------
|
||||
|
||||
You have to be aware of the current `limitations <https://www.alexirpan.com/2018/02/14/rl-hard.html>`_ of reinforcement learning.
|
||||
|
||||
|
||||
Model-free RL algorithms (i.e. all the algorithms implemented in SB) are usually *sample inefficient*. They require a lot of samples (sometimes millions of interactions) to learn something useful.
|
||||
That's why most of the successes in RL were achieved on games or in simulation only. For instance, in this `work <https://www.youtube.com/watch?v=aTDkYFZFWug>`_ by ETH Zurich, the ANYmal robot was trained in simulation only, and then tested in the real world.
|
||||
|
||||
As general advice, to obtain better performance, you should increase the budget of the agent (number of training timesteps).
|
||||
|
||||
|
||||
In order to achieve the desired behavior, expert knowledge is often required to design an adequate reward function.
|
||||
This *reward engineering* (or *RewArt* as coined by `Freek Stulp <http://www.freekstulp.net/>`_), necessitates several iterations. As a good example of reward shaping,
|
||||
you can take a look at `Deep Mimic paper <https://xbpeng.github.io/projects/DeepMimic/index.html>`_ which combines imitation learning and reinforcement learning to do acrobatic moves.
|
||||
|
||||
One last limitation of RL is the instability of training. That is to say, you can observe during training a huge drop in performance.
|
||||
This behavior is particularly present in ``DDPG``, that's why its extension ``TD3`` tries to tackle that issue.
|
||||
Other methods, like ``TRPO`` or ``PPO``, make use of a *trust region* to minimize that problem by avoiding too large an update.
|
||||
|
||||
|
||||
How to evaluate an RL algorithm?
|
||||
--------------------------------
|
||||
|
||||
Because most algorithms use exploration noise during training, you need a separate test environment to evaluate the performance
|
||||
of your agent at a given time. It is recommended to periodically evaluate your agent for ``n`` test episodes (``n`` is usually between 5 and 20)
|
||||
and average the reward per episode to have a good estimate.
|
||||
|
||||
As some policies are stochastic by default (e.g. A2C or PPO), you should also try to set ``deterministic=True`` when calling the ``.predict()`` method,
|
||||
this frequently leads to better performance.
|
||||
Looking at the training curve (episode reward as a function of the timesteps) is a good proxy but underestimates the agent's true performance.
|
||||
|
||||
|
||||
.. note::
|
||||
|
||||
We provide an ``EvalCallback`` for doing such evaluation. You can read more about it in the :ref:`Callbacks <callbacks>` section.
|
||||
|
||||
|
||||
|
||||
We suggest reading `Deep Reinforcement Learning that Matters <https://arxiv.org/abs/1709.06560>`_ for a good discussion about RL evaluation.
|
||||
|
||||
You can also take a look at this `blog post <https://openlab-flowers.inria.fr/t/how-many-random-seeds-should-i-use-statistical-power-analysis-in-deep-reinforcement-learning-experiments/457>`_
|
||||
and this `issue <https://github.com/hill-a/stable-baselines/issues/199>`_ by Cédric Colas.
|
||||
|
||||
|
||||
Which algorithm should I use?
|
||||
=============================
|
||||
|
||||
There is no silver bullet in RL, depending on your needs and problem, you may choose one or the other.
|
||||
The first distinction comes from your action space, i.e., do you have discrete (e.g. LEFT, RIGHT, ...)
|
||||
or continuous actions (ex: go to a certain speed)?
|
||||
|
||||
Some algorithms are only tailored for one or the other domain: ``DQN`` only supports discrete actions, whereas ``SAC`` is restricted to continuous actions.
|
||||
|
||||
The second difference that will help you choose is whether you can parallelize your training or not.
|
||||
If what matters is the wall clock training time, then you should lean towards ``A2C`` and its derivatives (PPO, ...).
|
||||
Take a look at the `Vectorized Environments <vec_envs.html>`_ to learn more about training with multiple workers.
|
||||
|
||||
To sum it up:
|
||||
|
||||
Discrete Actions
|
||||
----------------
|
||||
|
||||
.. note::
|
||||
|
||||
This covers ``Discrete``, ``MultiDiscrete``, ``Binary`` and ``MultiBinary`` spaces
|
||||
|
||||
|
||||
Discrete Actions - Single Process
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
DQN with extensions (double DQN, prioritized replay, ...) are the recommended algorithms.
|
||||
DQN is usually slower to train (regarding wall clock time) but is the most sample efficient (because of its replay buffer).
|
||||
|
||||
Discrete Actions - Multiprocessed
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
You should give a try to PPO or A2C.
|
||||
|
||||
|
||||
Continuous Actions
|
||||
------------------
|
||||
|
||||
Continuous Actions - Single Process
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Current State Of The Art (SOTA) algorithms are ``SAC`` and ``TD3``.
|
||||
Please use the hyperparameters in the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_ for best results.
|
||||
|
||||
|
||||
Continuous Actions - Multiprocessed
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Take a look at PPO, TRPO or A2C. Again, don't forget to take the hyperparameters from the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_
|
||||
for continuous actions problems (cf *Bullet* envs).
|
||||
|
||||
.. note::
|
||||
|
||||
Normalization is critical for those algorithms
|
||||
|
||||
|
||||
|
||||
.. Goal Environment
|
||||
.. -----------------
|
||||
..
|
||||
.. If your environment follows the ``GoalEnv`` interface (cf `HER <../modules/her.html>`_), then you should use
|
||||
.. HER + (SAC/TD3/DDPG/DQN) depending on the action space.
|
||||
..
|
||||
..
|
||||
.. .. note::
|
||||
..
|
||||
.. The number of workers is an important hyperparameters for experiments with HER
|
||||
..
|
||||
|
||||
|
||||
Tips and Tricks when creating a custom environment
|
||||
==================================================
|
||||
|
||||
If you want to learn about how to create a custom environment, we recommend you read this `page <custom_env.html>`_.
|
||||
We also provide a `colab notebook <https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/5_custom_gym_env.ipynb>`_ for
|
||||
a concrete example of creating a custom gym environment.
|
||||
|
||||
Some basic advice:
|
||||
|
||||
- always normalize your observation space when you can, i.e., when you know the boundaries
|
||||
- normalize your action space and make it symmetric when continuous (cf potential issue below). A good practice is to rescale your actions to lie in [-1, 1]. This does not limit you as you can easily rescale the action inside the environment
|
||||
- start with shaped reward (i.e. informative reward) and simplified version of your problem
|
||||
- debug with random actions to check that your environment works and follows the gym interface:
|
||||
|
||||
|
||||
We provide a helper to check that your environment runs without error:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from stable_baselines3.common.env_checker import check_env
|
||||
|
||||
env = CustomEnv(arg1, ...)
|
||||
# It will check your custom environment and output additional warnings if needed
|
||||
check_env(env)
|
||||
|
||||
|
||||
If you want to quickly try a random agent on your environment, you can also do:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
env = YourEnv()
|
||||
obs = env.reset()
|
||||
n_steps = 10
|
||||
for _ in range(n_steps):
|
||||
# Random action
|
||||
action = env.action_space.sample()
|
||||
obs, reward, done, info = env.step(action)
|
||||
if done:
|
||||
obs = env.reset()
|
||||
|
||||
|
||||
**Why should I normalize the action space?**
|
||||
|
||||
|
||||
Most reinforcement learning algorithms rely on a Gaussian distribution (initially centered at 0 with std 1) for continuous actions.
|
||||
So, if you forget to normalize the action space when using a custom environment,
|
||||
this can harm learning and be difficult to debug (cf attached image and `issue #473 <https://github.com/hill-a/stable-baselines/issues/473>`_).
|
||||
|
||||
.. figure:: ../_static/img/mistake.png
|
||||
|
||||
|
||||
Another consequence of using a Gaussian is that the action range is not bounded.
|
||||
That's why clipping is usually used as a bandage to stay in a valid interval.
|
||||
A better solution would be to use a squashing function (cf ``SAC``) or a Beta distribution (cf `issue #112 <https://github.com/hill-a/stable-baselines/issues/112>`_).
|
||||
|
||||
.. note::
|
||||
|
||||
This statement is not true for ``DDPG`` or ``TD3`` because they don't rely on any probability distribution.
|
||||
|
||||
|
||||
|
||||
Tips and Tricks when implementing an RL algorithm
|
||||
=================================================
|
||||
|
||||
When you try to reproduce an RL paper by implementing the algorithm, the `nuts and bolts of RL research <http://joschu.net/docs/nuts-and-bolts.pdf>`_
|
||||
by John Schulman are quite useful (`video <https://www.youtube.com/watch?v=8EcdaCk9KaQ>`_).
|
||||
|
||||
We *recommend following those steps to have a working RL algorithm*:
|
||||
|
||||
1. Read the original paper several times
|
||||
2. Read existing implementations (if available)
|
||||
3. Try to have some "sign of life" on toy problems
|
||||
4. Validate the implementation by making it run on harder and harder envs (you can compare results against the RL zoo)
|
||||
You usually need to run hyperparameter optimization for that step.
|
||||
|
||||
You need to be particularly careful about the shape of the different objects you are manipulating (a broadcast mistake will fail silently, cf `issue #75 <https://github.com/hill-a/stable-baselines/pull/76>`_)
|
||||
and when to stop the gradient propagation.
|
||||
|
||||
A personal pick (by @araffin) for environments with gradual difficulty in RL with continuous actions:
|
||||
|
||||
1. Pendulum (easy to solve)
|
||||
2. HalfCheetahBullet (medium difficulty with local minima and shaped reward)
|
||||
3. BipedalWalkerHardcore (if it works on that one, then you can have a cookie)
|
||||
|
||||
in RL with discrete actions:
|
||||
|
||||
1. CartPole-v1 (easy to be better than random agent, harder to achieve maximal performance)
|
||||
2. LunarLander
|
||||
3. Pong (one of the easiest Atari games)
|
||||
4. other Atari games (e.g. Breakout)
|
||||
|
|
@ -6,11 +6,11 @@ Vectorized Environments
|
|||
=======================
|
||||
|
||||
Vectorized Environments are a method for stacking multiple independent environments into a single environment.
|
||||
Instead of training an RL agent on 1 environment per step, it allows us to train it on `n` environments per step.
|
||||
Because of this, `actions` passed to the environment are now a vector (of dimension `n`).
|
||||
It is the same for `observations`, `rewards` and end of episode signals (`dones`).
|
||||
In the case of non-array observation spaces such as `Dict` or `Tuple`, where different sub-spaces
|
||||
may have different shapes, the sub-observations are vectors (of dimension `n`).
|
||||
Instead of training an RL agent on 1 environment per step, it allows us to train it on ``n`` environments per step.
|
||||
Because of this, ``actions`` passed to the environment are now a vector (of dimension ``n``).
|
||||
It is the same for ``observations``, ``rewards`` and end of episode signals (``dones``).
|
||||
In the case of non-array observation spaces such as ``Dict`` or ``Tuple``, where different sub-spaces
|
||||
may have different shapes, the sub-observations are vectors (of dimension ``n``).
|
||||
|
||||
============= ======= ============ ======== ========= ================
|
||||
Name ``Box`` ``Discrete`` ``Dict`` ``Tuple`` Multi Processing
|
||||
|
|
@ -27,7 +27,7 @@ SubprocVecEnv ✔️ ✔️ ✔️ ✔️ ✔️
|
|||
|
||||
When using vectorized environments, the environments are automatically reset at the end of each episode.
|
||||
Thus, the observation returned for the i-th environment when ``done[i]`` is true will in fact be the first observation of the next episode, not the last observation of the episode that has just terminated.
|
||||
You can access the "real" final observation of the terminated episode—that is, the one that accompanied the ``done`` event provided by the underlying environment—using the ``terminal_observation`` keys in the info dicts returned by the `VecEnv`.
|
||||
You can access the "real" final observation of the terminated episode—that is, the one that accompanied the ``done`` event provided by the underlying environment—using the ``terminal_observation`` keys in the info dicts returned by the vecenv.
|
||||
|
||||
.. warning::
|
||||
|
||||
|
|
@ -69,3 +69,24 @@ VecNormalize
|
|||
|
||||
.. autoclass:: VecNormalize
|
||||
:members:
|
||||
|
||||
|
||||
VecVideoRecorder
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: VecVideoRecorder
|
||||
:members:
|
||||
|
||||
|
||||
VecCheckNan
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: VecCheckNan
|
||||
:members:
|
||||
|
||||
|
||||
VecTransposeImage
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: VecTransposeImage
|
||||
:members:
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
.. Stable Baselines documentation master file, created by
|
||||
.. Stable Baselines3 documentation master file, created by
|
||||
sphinx-quickstart on Thu Sep 26 11:06:54 2019.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
|
@ -6,21 +6,41 @@
|
|||
Welcome to Stable Baselines3 docs!
|
||||
==================================
|
||||
|
||||
`Stable Baselines3 <https://github.com/DLR-RM/stable-baselines3>`_ is the next major version (PyTorch edition) of `Stable Baselines <https://github.com/hill-a/stable-baselines>`_,
|
||||
a set of improved implementations of reinforcement learning algorithms.
|
||||
`Stable Baselines3 <https://github.com/DLR-RM/stable-baselines3>`_ is a set of improved implementations of reinforcement learning algorithms in PyTorch.
|
||||
It is the next major version (PyTorch edition) of `Stable Baselines <https://github.com/hill-a/stable-baselines>`_.
|
||||
|
||||
|
||||
Github repository: https://github.com/DLR-RM/stable-baselines3
|
||||
|
||||
RL Baselines3 Zoo (collection of pre-trained agents): https://github.com/DLR-RM/rl-baselines3-zoo
|
||||
|
||||
RL Baselines3 Zoo also offers a simple interface to train, evaluate agents and do hyperparameter tuning.
|
||||
|
||||
|
||||
Main Features
|
||||
--------------
|
||||
|
||||
- Unified structure for all algorithms
|
||||
- PEP8 compliant (unified code style)
|
||||
- Documented functions and classes
|
||||
- Tests, high code coverage and type hints
|
||||
- Clean code
|
||||
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: User Guide
|
||||
|
||||
guide/install
|
||||
guide/quickstart
|
||||
guide/rl_tips
|
||||
guide/rl
|
||||
guide/vec_envs
|
||||
guide/custom_env
|
||||
guide/callbacks
|
||||
guide/migration
|
||||
guide/checking_nan
|
||||
|
||||
|
||||
.. toctree::
|
||||
|
|
@ -33,12 +53,20 @@ RL Baselines3 Zoo also offers a simple interface to train, evaluate agents and d
|
|||
modules/sac
|
||||
modules/td3
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Common
|
||||
|
||||
common/distributions
|
||||
common/evaluation
|
||||
common/env_checker
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Misc
|
||||
|
||||
misc/changelog
|
||||
misc/projects
|
||||
|
||||
|
||||
Citing Stable Baselines3
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ Others:
|
|||
|
||||
Documentation:
|
||||
^^^^^^^^^^^^^^
|
||||
- Added most documentation (adapted from Stable-Baselines)
|
||||
|
||||
|
||||
Pre-Release 0.5.0 (2020-05-05)
|
||||
|
|
|
|||
26
docs/misc/projects.rst
Normal file
26
docs/misc/projects.rst
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
.. _projects:
|
||||
|
||||
Projects
|
||||
=========
|
||||
|
||||
This is a list of projects using stable-baselines3.
|
||||
Please tell us if you want your project to appear on this page ;)
|
||||
|
||||
|
||||
.. RL Racing Robot
|
||||
.. --------------------------
|
||||
.. Implementation of reinforcement learning approach to make a donkey car learn to race.
|
||||
.. Uses SAC on autoencoder features
|
||||
..
|
||||
.. | Author: Antonin Raffin (@araffin)
|
||||
.. | Github repo: https://github.com/araffin/RL-Racing-Robot
|
||||
|
||||
|
||||
.. Generalized State Dependent Exploration for Deep Reinforcement Learning in Robotics
|
||||
.. -----------------------------------------------------------------------------------
|
||||
..
|
||||
.. An exploration method to train RL agent directly on real robots.
|
||||
..
|
||||
.. | Author: Antonin Raffin, Freek Stulp
|
||||
.. | Github: https://github.com/DLR-RM/stable-baselines3/tree/sde
|
||||
.. | Paper:
|
||||
|
|
@ -38,15 +38,15 @@ MultiBinary ❌ ❌
|
|||
Example
|
||||
-------
|
||||
|
||||
Train a A2C agent on `CartPole-v1` using 4 processes.
|
||||
Train an A2C agent on ``CartPole-v1`` using 4 environments.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import gym
|
||||
|
||||
from stable_baselines3.common.policies import MlpPolicy
|
||||
from stable_baselines3.common import make_vec_env
|
||||
from stable_baselines3 import A2C
|
||||
from stable_baselines3.a2c import MlpPolicy
|
||||
from stable_baselines3.common.cmd_utils import make_vec_env
|
||||
|
||||
# Parallel environments
|
||||
env = make_vec_env('CartPole-v1', n_envs=4)
|
||||
|
|
|
|||
|
|
@ -10,3 +10,12 @@ Common interface for all the RL algorithms
|
|||
|
||||
.. autoclass:: BaseRLModel
|
||||
:members:
|
||||
|
||||
|
||||
Base RL Class
|
||||
=============
|
||||
|
||||
The base RL model for off-policy algorithms (e.g. SAC/TD3)
|
||||
|
||||
.. autoclass:: OffPolicyRLModel
|
||||
:members:
|
||||
|
|
|
|||
|
|
@ -47,34 +47,32 @@ MultiBinary ❌ ❌
|
|||
Example
|
||||
-------
|
||||
|
||||
Train a PPO agent on `Pendulum-v0` using 4 processes.
|
||||
Train a PPO agent on ``CartPole-v1`` using 4 environments.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import gym
|
||||
import gym
|
||||
|
||||
from stable_baselines3.ppo.policies import MlpPolicy
|
||||
from stable_baselines3.common.vec_env import SubprocVecEnv
|
||||
from stable_baselines3 import PPO
|
||||
from stable_baselines3 import PPO
|
||||
from stable_baselines3.ppo import MlpPolicy
|
||||
from stable_baselines3.common.cmd_utils import make_vec_env
|
||||
|
||||
# multiprocess environment
|
||||
n_cpu = 4
|
||||
env = SubprocVecEnv([lambda: gym.make('Pendulum-v0') for i in range(n_cpu)])
|
||||
# Parallel environments
|
||||
env = make_vec_env('CartPole-v1', n_envs=4)
|
||||
|
||||
model = PPO(MlpPolicy, env, verbose=1)
|
||||
model.learn(total_timesteps=25000)
|
||||
model.save("ppo2_cartpole")
|
||||
model = PPO(MlpPolicy, env, verbose=1)
|
||||
model.learn(total_timesteps=25000)
|
||||
model.save("ppo_cartpole")
|
||||
|
||||
del model # remove to demonstrate saving and loading
|
||||
del model # remove to demonstrate saving and loading
|
||||
|
||||
model = PPO.load("ppo2_cartpole")
|
||||
model = PPO.load("ppo_cartpole")
|
||||
|
||||
# Enjoy trained agent
|
||||
obs = env.reset()
|
||||
while True:
|
||||
action, _states = model.predict(obs)
|
||||
obs, rewards, dones, info = env.step(action)
|
||||
env.render()
|
||||
obs = env.reset()
|
||||
while True:
|
||||
action, _states = model.predict(obs)
|
||||
obs, rewards, dones, info = env.step(action)
|
||||
env.render()
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ A key feature of SAC, and a major difference with common RL algorithms, is that
|
|||
|
||||
.. warning::
|
||||
|
||||
The SAC model does not support ``stable_baselines3.common.policies`` because it uses double q-values
|
||||
The SAC model does not support ``stable_baselines3.ppo.policies`` because it uses double q-values
|
||||
and value estimation, as a result it must use its own policy models (see :ref:`sac_policies`).
|
||||
|
||||
|
||||
|
|
@ -24,6 +24,7 @@ A key feature of SAC, and a major difference with common RL algorithms, is that
|
|||
:nosignatures:
|
||||
|
||||
MlpPolicy
|
||||
CnnPolicy
|
||||
|
||||
|
||||
Notes
|
||||
|
|
@ -72,15 +73,13 @@ Example
|
|||
import gym
|
||||
import numpy as np
|
||||
|
||||
from stable_baselines3.sac.policies import MlpPolicy
|
||||
from stable_baselines3.common.vec_env import DummyVecEnv
|
||||
from stable_baselines3 import SAC
|
||||
from stable_baselines3.sac import MlpPolicy
|
||||
|
||||
env = gym.make('Pendulum-v0')
|
||||
env = DummyVecEnv([lambda: env])
|
||||
|
||||
model = SAC(MlpPolicy, env, verbose=1)
|
||||
model.learn(total_timesteps=50000, log_interval=10)
|
||||
model.learn(total_timesteps=10000, log_interval=4)
|
||||
model.save("sac_pendulum")
|
||||
|
||||
del model # remove to demonstrate saving and loading
|
||||
|
|
@ -90,8 +89,10 @@ Example
|
|||
obs = env.reset()
|
||||
while True:
|
||||
action, _states = model.predict(obs)
|
||||
obs, rewards, dones, info = env.step(action)
|
||||
obs, reward, done, info = env.step(action)
|
||||
env.render()
|
||||
if done:
|
||||
obs = env.reset()
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -108,3 +109,7 @@ SAC Policies
|
|||
.. autoclass:: MlpPolicy
|
||||
:members:
|
||||
:inherited-members:
|
||||
|
||||
.. .. autoclass:: CnnPolicy
|
||||
.. :members:
|
||||
.. :inherited-members:
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ We recommend reading `OpenAI Spinning guide on TD3 <https://spinningup.openai.co
|
|||
|
||||
.. warning::
|
||||
|
||||
The TD3 model does not support ``stable_baselines3.common.policies`` because it uses double q-values
|
||||
The TD3 model does not support ``stable_baselines3.ppo.policies`` because it uses double q-values
|
||||
estimation, as a result it must use its own policy models (see :ref:`td3_policies`).
|
||||
|
||||
|
||||
|
|
@ -73,7 +73,7 @@ Example
|
|||
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
|
||||
|
||||
model = TD3(MlpPolicy, 'Pendulum-v0', action_noise=action_noise, verbose=1)
|
||||
model.learn(total_timesteps=50000, log_interval=10)
|
||||
model.learn(total_timesteps=10000, log_interval=10)
|
||||
model.save("td3_pendulum")
|
||||
env = model.get_env()
|
||||
|
||||
|
|
@ -87,6 +87,7 @@ Example
|
|||
obs, rewards, dones, info = env.step(action)
|
||||
env.render()
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
|
@ -102,3 +103,8 @@ TD3 Policies
|
|||
.. autoclass:: MlpPolicy
|
||||
:members:
|
||||
:inherited-members:
|
||||
|
||||
|
||||
.. .. autoclass:: CnnPolicy
|
||||
.. :members:
|
||||
.. :inherited-members:
|
||||
|
|
|
|||
4
setup.py
4
setup.py
|
|
@ -50,8 +50,10 @@ model.learn(total_timesteps=10000)
|
|||
obs = env.reset()
|
||||
for i in range(1000):
|
||||
action, _states = model.predict(obs, deterministic=True)
|
||||
obs, rewards, dones, info = env.step(action)
|
||||
obs, reward, done, info = env.step(action)
|
||||
env.render()
|
||||
if done:
|
||||
obs = env.reset()
|
||||
```
|
||||
|
||||
Or just train a model with a one liner if [the environment is registered in Gym](https://github.com/openai/gym/wiki/Environments) and if [the policy is registered](https://stable-baselines.readthedocs.io/en/master/guide/custom_policy.html):
|
||||
|
|
|
|||
|
|
@ -1,2 +1,2 @@
|
|||
from stable_baselines3.a2c.a2c import A2C
|
||||
from stable_baselines3.ppo.policies import MlpPolicy
|
||||
from stable_baselines3.ppo.policies import MlpPolicy, CnnPolicy
|
||||
|
|
|
|||
|
|
@ -1,2 +1,2 @@
|
|||
from stable_baselines3.ppo.ppo import PPO
|
||||
from stable_baselines3.ppo.policies import MlpPolicy
|
||||
from stable_baselines3.ppo.policies import MlpPolicy, CnnPolicy
|
||||
|
|
|
|||
|
|
@ -1,2 +1,2 @@
|
|||
from stable_baselines3.sac.sac import SAC
|
||||
from stable_baselines3.sac.policies import MlpPolicy
|
||||
from stable_baselines3.sac.policies import MlpPolicy, CnnPolicy
|
||||
|
|
|
|||
|
|
@ -1,2 +1,2 @@
|
|||
from stable_baselines3.td3.td3 import TD3
|
||||
from stable_baselines3.td3.policies import MlpPolicy
|
||||
from stable_baselines3.td3.policies import MlpPolicy, CnnPolicy
|
||||
|
|
|
|||
Loading…
Reference in a new issue