Add base doc

This commit is contained in:
Antonin RAFFIN 2020-05-07 10:10:51 +02:00
parent 94b1267817
commit d17f29c8ad
29 changed files with 1194 additions and 58 deletions

View file

@ -35,6 +35,29 @@ These algorithms will make it easier for the research community and industry to
<!-- | Tensorboard support | :heavy_check_mark: | -->
### Roadmap to V1.0
Please look at the issue for more details.
Planned features:
- [ ] DQN (almost ready, currently in testing phase)
- [ ] DDPG (you can use its successor TD3 for now)
- [ ] HER
- [ ] Support for MultiDiscrete and MultiBinary action spaces
### Planned features (v1.1+)
- [ ] Full Tensorboard support
- [ ] DQN extensions (prioritized replay, double q-learning, ...)
- [ ] Support for `Tuple` and `Dict` observation spaces
- [ ] Recurrent Policies
- [ ] TRPO
## Migration guide
**TODO: migration guide from Stable-Baselines in the documentation**
## Documentation
Documentation is available online: [https://stable-baselines.readthedocs.io/](https://stable-baselines.readthedocs.io/)
@ -102,8 +125,10 @@ model.learn(total_timesteps=10000)
obs = env.reset()
for i in range(1000):
action, _states = model.predict(obs, deterministic=True)
obs, rewards, dones, info = env.step(action)
obs, reward, done, info = env.step(action)
env.render()
if done:
obs = env.reset()
env.close()
```

View file

@ -1,4 +1,4 @@
## Stable Baselines Documentation
## Stable Baselines3 Documentation
This folder contains documentation for the RL baselines.

BIN
docs/_static/img/mistake.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 145 KiB

View file

@ -0,0 +1,26 @@
.. _distributions:
Probability Distributions
=========================
Probability distributions used for the different action spaces:
- ``CategoricalDistribution`` -> Discrete
- ``DiagGaussianDistribution`` -> Box (continuous actions)
- ``StateDependentNoiseDistribution`` -> Box (continuous actions) when ``use_sde=True``
.. - ``MultiCategoricalDistribution`` -> MultiDiscrete
.. - ``BernoulliDistribution`` -> MultiBinary
The policy networks output parameters for the distributions (named ``flat`` in the methods).
Actions are then sampled from those distributions.
For instance, in the case of discrete actions, the policy network outputs the probability
of taking each action. The ``CategoricalDistribution`` allows one to sample from it,
compute the entropy and the log probability (``log_prob``), and backpropagate the gradient.
In the case of continuous actions, a Gaussian distribution is used. The policy network outputs
mean and (log) std of the distribution (assumed to be a ``DiagGaussianDistribution``).
.. automodule:: stable_baselines3.common.distributions
:members:

View file

@ -0,0 +1,7 @@
.. _env_checker:
Gym Environment Checker
========================
.. automodule:: stable_baselines3.common.env_checker
:members:

View file

@ -0,0 +1,7 @@
.. _eval:
Evaluation Helper
=================
.. automodule:: stable_baselines3.common.evaluation
:members:

View file

@ -216,5 +216,5 @@ texinfo_documents = [
# }
# kornia's hack to get rtd builder to install latest pytorch
# if 'READTHEDOCS' in os.environ:
# os.system('pip install torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html')
if on_rtd:
os.system('pip install torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html')

296
docs/guide/callbacks.rst Normal file
View file

@ -0,0 +1,296 @@
.. _callbacks:
Callbacks
=========
A callback is a set of functions that will be called at given stages of the training procedure.
You can use callbacks to access internal state of the RL model during training.
It allows one to do monitoring, auto saving, model manipulation, progress bars, ...
Custom Callback
---------------
To build a custom callback, you need to create a class that derives from ``BaseCallback``.
This will give you access to events (``_on_training_start``, ``_on_step``) and useful variables (like ``self.model`` for the RL model).
.. You can find two examples of custom callbacks in the documentation: one for saving the best model according to the training reward (see :ref:`Examples <examples>`), and one for logging additional values with Tensorboard (see :ref:`Tensorboard section <tensorboard>`).
.. code-block:: python
from stable_baselines3.common.callbacks import BaseCallback
class CustomCallback(BaseCallback):
"""
A custom callback that derives from ``BaseCallback``.
:param verbose: (int) Verbosity level 0: no output 1: info 2: debug
"""
def __init__(self, verbose=0):
super(CustomCallback, self).__init__(verbose)
# Those variables will be accessible in the callback
# (they are defined in the base class)
# The RL model
# self.model = None # type: BaseRLModel
# An alias for self.model.get_env(), the environment used for training
# self.training_env = None # type: Union[gym.Env, VecEnv, None]
# Number of times the callback was called
# self.n_calls = 0 # type: int
# self.num_timesteps = 0 # type: int
# local and global variables
# self.locals = None # type: Dict[str, Any]
# self.globals = None # type: Dict[str, Any]
# The logger object, used to report things in the terminal
# self.logger = None # type: logger.Logger
# # Sometimes, for event callback, it is useful
# # to have access to the parent object
# self.parent = None # type: Optional[BaseCallback]
def _on_training_start(self) -> None:
"""
This method is called before the first rollout starts.
"""
pass
def _on_rollout_start(self) -> None:
"""
A rollout is the collection of environment interaction
using the current policy.
This event is triggered before collecting new samples.
"""
pass
def _on_step(self) -> bool:
"""
This method will be called by the model after each call to `env.step()`.
For child callback (of an `EventCallback`), this will be called
when the event is triggered.
:return: (bool) If the callback returns False, training is aborted early.
"""
return True
def _on_rollout_end(self) -> None:
"""
This event is triggered before updating the policy.
"""
pass
def _on_training_end(self) -> None:
"""
This event is triggered before exiting the `learn()` method.
"""
pass
.. note::
``self.num_timesteps`` corresponds to the total number of steps taken in the environment, i.e., it is the number of environments multiplied by the number of times ``env.step()`` was called.
In other words, ``self.num_timesteps`` is incremented by ``n_envs`` (number of environments) after each call to ``env.step()``.
.. note::
For off-policy algorithms like SAC, DDPG, TD3 or DQN, the notion of ``rollout`` corresponds to the steps taken in the environment between two updates.
.. _EventCallback:
Event Callback
--------------
Compared to Keras, Stable Baselines provides a second type of ``BaseCallback``, named ``EventCallback`` that is meant to trigger events. When an event is triggered, then a child callback is called.
As an example, :ref:`EvalCallback` is an ``EventCallback`` that will trigger its child callback when there is a new best model.
A child callback is for instance :ref:`StopTrainingOnRewardThreshold <StopTrainingCallback>` that stops the training if the mean reward achieved by the RL model is above a threshold.
.. note::
We recommend taking a look at the source code of :ref:`EvalCallback` and :ref:`StopTrainingOnRewardThreshold <StopTrainingCallback>` to have a better overview of what can be achieved with this kind of callback.
.. code-block:: python
class EventCallback(BaseCallback):
"""
Base class for triggering callback on event.
:param callback: (Optional[BaseCallback]) Callback that will be called
when an event is triggered.
:param verbose: (int)
"""
def __init__(self, callback: Optional[BaseCallback] = None, verbose: int = 0):
super(EventCallback, self).__init__(verbose=verbose)
self.callback = callback
# Give access to the parent
if callback is not None:
self.callback.parent = self
...
def _on_event(self) -> bool:
if self.callback is not None:
return self.callback()
return True
Callback Collection
-------------------
Stable Baselines provides you with a set of common callbacks for:
- saving the model periodically (:ref:`CheckpointCallback`)
- evaluating the model periodically and saving the best one (:ref:`EvalCallback`)
- chaining callbacks (:ref:`CallbackList`)
- triggering callback on events (:ref:`EventCallback`, :ref:`EveryNTimesteps`)
- stopping the training early based on a reward threshold (:ref:`StopTrainingOnRewardThreshold <StopTrainingCallback>`)
.. _CheckpointCallback:
CheckpointCallback
^^^^^^^^^^^^^^^^^^
Callback for saving a model every ``save_freq`` steps. You must specify a log folder (``save_path``)
and optionally a prefix for the checkpoints (``rl_model`` by default).
.. code-block:: python
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CheckpointCallback
# Save a checkpoint every 1000 steps
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/',
name_prefix='rl_model')
model = SAC('MlpPolicy', 'Pendulum-v0')
model.learn(2000, callback=checkpoint_callback)
.. _EvalCallback:
EvalCallback
^^^^^^^^^^^^
Evaluate periodically the performance of an agent, using a separate test environment.
It will save the best model if ``best_model_save_path`` folder is specified and save the evaluation results in a numpy archive (``evaluations.npz``) if ``log_path`` folder is specified.
.. note::
You can pass a child callback via the ``callback_on_new_best`` argument. It will be triggered each time there is a new best model.
.. code-block:: python
import gym
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback
# Separate evaluation env
eval_env = gym.make('Pendulum-v0')
# Use deterministic actions for evaluation
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/',
log_path='./logs/', eval_freq=500,
deterministic=True, render=False)
model = SAC('MlpPolicy', 'Pendulum-v0')
model.learn(5000, callback=eval_callback)
.. _Callbacklist:
CallbackList
^^^^^^^^^^^^
Class for chaining callbacks, they will be called sequentially.
Alternatively, you can pass a list of callbacks directly to the ``learn()`` method; it will be converted automatically to a ``CallbackList``.
.. code-block:: python
import gym
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/')
# Separate evaluation env
eval_env = gym.make('Pendulum-v0')
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model',
log_path='./logs/results', eval_freq=500)
# Create the callback list
callback = CallbackList([checkpoint_callback, eval_callback])
model = SAC('MlpPolicy', 'Pendulum-v0')
# Equivalent to:
# model.learn(5000, callback=[checkpoint_callback, eval_callback])
model.learn(5000, callback=callback)
.. _StopTrainingCallback:
StopTrainingOnRewardThreshold
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Stop the training once a threshold in episodic reward (mean episode reward over the evaluations) has been reached (i.e., when the model is good enough).
It must be used with the :ref:`EvalCallback` and use the event triggered by a new best model.
.. code-block:: python
import gym
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
# Separate evaluation env
eval_env = gym.make('Pendulum-v0')
# Stop training when the model reaches the reward threshold
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1)
eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1)
model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1)
# Almost infinite number of timesteps, but the training will stop
# early as soon as the reward threshold is reached
model.learn(int(1e10), callback=eval_callback)
.. _EveryNTimesteps:
EveryNTimesteps
^^^^^^^^^^^^^^^
An :ref:`EventCallback` that will trigger its child callback every ``n_steps`` timesteps.
.. note::
Because of the way ``PPO1`` and ``TRPO`` work (they rely on MPI), ``n_steps`` is a lower bound between two events.
.. code-block:: python
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback, EveryNTimesteps
# this is equivalent to defining CheckpointCallback(save_freq=500)
# checkpoint_callback will be triggered every 500 steps
checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/')
event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)
model = PPO('MlpPolicy', 'Pendulum-v0', verbose=1)
model.learn(int(2e4), callback=event_callback)
.. automodule:: stable_baselines3.common.callbacks
:members:

164
docs/guide/checking_nan.rst Normal file
View file

@ -0,0 +1,164 @@
Dealing with NaNs and infs
==========================
During the training of a model on a given environment, it is possible that the RL model becomes completely
corrupted when a NaN or an inf is given or returned from the RL model.
How and why?
------------
The issue arises when NaNs or infs do not crash, but simply get propagated through the training,
until all the floating point numbers converge to NaN or inf. This is in line with the
`IEEE Standard for Floating-Point Arithmetic (IEEE 754) <https://ieeexplore.ieee.org/document/4610935>`_ standard, as it says:
.. note::
Five possible exceptions can occur:
- Invalid operation (:math:`\sqrt{-1}`, :math:`\inf \times 0`, :math:`\text{NaN}\ \mathrm{mod}\ 1`, ...) returns NaN
- Division by zero:
- if the operand is not zero (:math:`1/0`, :math:`-2/0`, ...) returns :math:`\pm\inf`
- if the operand is zero (:math:`0/0`) returns signaling NaN
- Overflow (exponent too high to represent) returns :math:`\pm\inf`
- Underflow (exponent too low to represent) returns :math:`0`
- Inexact (not representable exactly in base 2, eg: :math:`1/5`) returns the rounded value (ex: :code:`assert (1/5) * 3 == 0.6000000000000001`)
And of these, only ``Division by zero`` will signal an exception, the rest will propagate invalid values quietly.
In python, dividing by zero will indeed raise the exception: ``ZeroDivisionError: float division by zero``,
but ignores the rest.
The default in numpy will warn: ``RuntimeWarning: invalid value encountered``
but will not halt the code.
Anomaly detection with PyTorch
------------------------------
To enable NaN detection in PyTorch you can do
.. code-block:: python
import torch as th
th.autograd.set_detect_anomaly(True)
Numpy parameters
----------------
Numpy has a convenient way of dealing with invalid value: `numpy.seterr <https://docs.scipy.org/doc/numpy/reference/generated/numpy.seterr.html>`_,
which defines for the python process, how it should handle floating point error.
.. code-block:: python
import numpy as np
np.seterr(all='raise') # define before your code.
print("numpy test:")
a = np.float64(1.0)
b = np.float64(0.0)
val = a / b # this will now raise an exception instead of a warning.
print(val)
but this will also avoid overflow issues on floating point numbers:
.. code-block:: python
import numpy as np
np.seterr(all='raise') # define before your code.
print("numpy overflow test:")
a = np.float64(10)
b = np.float64(1000)
val = a ** b # this will now raise an exception
print(val)
but will not avoid the propagation issues:
.. code-block:: python
import numpy as np
np.seterr(all='raise') # define before your code.
print("numpy propagation test:")
a = np.float64('NaN')
b = np.float64(1.0)
val = a + b # this will neither warn nor raise anything
print(val)
VecCheckNan Wrapper
-------------------
In order to find when and where the invalid value originated from, stable-baselines3 comes with a ``VecCheckNan`` wrapper.
It will monitor the actions, observations, and rewards, indicating what action or observation caused it and from what.
.. code-block:: python
import gym
from gym import spaces
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecCheckNan
class NanAndInfEnv(gym.Env):
"""Custom Environment that raised NaNs and Infs"""
metadata = {'render.modes': ['human']}
def __init__(self):
super(NanAndInfEnv, self).__init__()
self.action_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)
self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)
def step(self, _action):
randf = np.random.rand()
if randf > 0.99:
obs = float('NaN')
elif randf > 0.98:
obs = float('inf')
else:
obs = randf
return [obs], 0.0, False, {}
def reset(self):
return [0.0]
def render(self, mode='human', close=False):
pass
# Create environment
env = DummyVecEnv([lambda: NanAndInfEnv()])
env = VecCheckNan(env, raise_exception=True)
# Instantiate the agent
model = PPO('MlpPolicy', env)
# Train the agent
model.learn(total_timesteps=int(2e5)) # this will crash explaining that the invalid value originated from the environment.
RL Model hyperparameters
------------------------
Depending on your hyperparameters, NaN can occur much more often.
A great example of this: https://github.com/hill-a/stable-baselines/issues/340
Be aware, the hyperparameters given by default seem to work in most cases,
however your environment might not play nice with them.
If this is the case, try to read up on the effect each hyperparameter has on the model,
so that you can try and tune them to get a stable model. Alternatively, you can try automatic hyperparameter tuning (included in the rl zoo).
Missing values from datasets
----------------------------
If your environment is generated from an external dataset, do not forget to make sure your dataset does not contain NaNs,
as some datasets fill missing values with NaNs as a surrogate value.
Here is some reading material about finding NaNs: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
And filling the missing values with something else (imputation): https://towardsdatascience.com/how-to-handle-missing-data-8646b18db0d4

82
docs/guide/custom_env.rst Normal file
View file

@ -0,0 +1,82 @@
.. _custom_env:
Using Custom Environments
==========================
To use the rl baselines with custom environments, they just need to follow the *gym* interface.
That is to say, your environment must implement the following methods (and inherit from the OpenAI Gym ``Env`` class):
.. note::
If you are using images as input, the input values must be in [0, 255] as the observation
is normalized (dividing by 255 to have values in [0, 1]) when using CNN policies.
.. code-block:: python
import gym
from gym import spaces
class CustomEnv(gym.Env):
"""Custom Environment that follows gym interface"""
metadata = {'render.modes': ['human']}
def __init__(self, arg1, arg2, ...):
super(CustomEnv, self).__init__()
# Define action and observation space
# They must be gym.spaces objects
# Example when using discrete actions:
self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)
# Example for using image as input:
self.observation_space = spaces.Box(low=0, high=255,
shape=(HEIGHT, WIDTH, N_CHANNELS), dtype=np.uint8)
def step(self, action):
...
return observation, reward, done, info
def reset(self):
...
return observation # reward, done, info can't be included
def render(self, mode='human'):
...
def close (self):
...
Then you can define and train a RL agent with:
.. code-block:: python
# Instantiate the env
env = CustomEnv(arg1, ...)
# Define and Train the agent
model = A2C('CnnPolicy', env).learn(total_timesteps=1000)
To check that your environment follows the gym interface, please use:
.. code-block:: python
from stable_baselines3.common.env_checker import check_env
env = CustomEnv(arg1, ...)
# It will check your custom environment and output additional warnings if needed
check_env(env)
We have created a `colab notebook <https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/5_custom_gym_env.ipynb>`_ for
a concrete example of creating a custom environment.
You can also find a `complete guide online <https://github.com/openai/gym/blob/master/docs/creating-environments.md>`_
on creating a custom Gym environment.
Optionally, you can also register the environment with gym,
that will allow you to create the RL agent in one line (and use ``gym.make()`` to instantiate the env).
In the project, for testing purposes, we use a custom environment named ``IdentityEnv``
defined `in this file <https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/common/identity_env.py>`_.
An example of how to use it can be found `here <https://github.com/hill-a/stable-baselines/blob/master/tests/test_identity.py>`_.

152
docs/guide/install.rst Normal file
View file

@ -0,0 +1,152 @@
.. _install:
Installation
============
Prerequisites
-------------
Stable-Baselines3 requires python 3.6+.
Windows 10
~~~~~~~~~~
We recommend using `Anaconda <https://conda.io/docs/user-guide/install/windows.html>`_ for Windows users for easier installation of Python packages and required libraries. You need an environment with Python version 3.6 or above.
For a quick start you can move straight to installing Stable-Baselines3 in the next step.
.. note::
Trying to create Atari environments may result in vague errors related to missing DLL files and modules. This is an
issue with the atari-py package. `See this discussion for more information <https://github.com/openai/atari-py/issues/65>`_.
Stable Release
~~~~~~~~~~~~~~
To install Stable Baselines3 with pip, execute:
.. code-block:: bash
pip install stable-baselines3[extra]
This includes an optional dependency OpenCV to display the environments when using ``SubprocVecEnv``. If you do not need it, you can install without OpenCV:
.. code-block:: bash
pip install stable-baselines3
Bleeding-edge version
---------------------
.. code-block:: bash
pip install git+https://github.com/DLR-RM/stable-baselines3
Development version
-------------------
To contribute to Stable-Baselines3, install it with support for running tests and building the documentation:
.. code-block:: bash
git clone https://github.com/DLR-RM/stable-baselines3 && cd stable-baselines3
pip install -e .[docs,tests,extra]
.. Using Docker Images
.. -------------------
..
.. If you are looking for docker images with stable-baselines already installed in it,
.. we recommend using images from `RL Baselines3 Zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_.
..
.. Otherwise, the following images contained all the dependencies for stable-baselines3 but not the stable-baselines3 package itself.
.. They are made for development.
..
.. Use Built Images
.. ~~~~~~~~~~~~~~~~
..
.. GPU image (requires `nvidia-docker`_):
..
.. .. code-block:: bash
..
.. docker pull stablebaselines/stable-baselines3
..
.. CPU only:
..
.. .. code-block:: bash
..
.. docker pull stablebaselines/stable-baselines3-cpu
..
.. Build the Docker Images
.. ~~~~~~~~~~~~~~~~~~~~~~~~
..
.. Build GPU image (with nvidia-docker):
..
.. .. code-block:: bash
..
.. make docker-gpu
..
.. Build CPU image:
..
.. .. code-block:: bash
..
.. make docker-cpu
..
.. Note: if you are using a proxy, you need to pass extra params during
.. build and do some `tweaks`_:
..
.. .. code-block:: bash
..
.. --network=host --build-arg HTTP_PROXY=http://your.proxy.fr:8080/ --build-arg http_proxy=http://your.proxy.fr:8080/ --build-arg HTTPS_PROXY=https://your.proxy.fr:8080/ --build-arg https_proxy=https://your.proxy.fr:8080/
..
.. Run the images (CPU/GPU)
.. ~~~~~~~~~~~~~~~~~~~~~~~~
..
.. Run the nvidia-docker GPU image
..
.. .. code-block:: bash
..
.. docker run -it --runtime=nvidia --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/root/code/stable-baselines,type=bind stablebaselines/stable-baselines bash -c 'cd /root/code/stable-baselines/ && pytest tests/'
..
.. Or, with the shell file:
..
.. .. code-block:: bash
..
.. ./scripts/run_docker_gpu.sh pytest tests/
..
.. Run the docker CPU image
..
.. .. code-block:: bash
..
.. docker run -it --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/root/code/stable-baselines,type=bind stablebaselines/stable-baselines-cpu bash -c 'cd /root/code/stable-baselines/ && pytest tests/'
..
.. Or, with the shell file:
..
.. .. code-block:: bash
..
.. ./scripts/run_docker_cpu.sh pytest tests/
..
.. Explanation of the docker command:
..
.. - ``docker run -it`` create an instance of an image (=container), and
.. run it interactively (so ctrl+c will work)
.. - ``--rm`` option means to remove the container once it exits/stops
.. (otherwise, you will have to use ``docker rm``)
.. - ``--network host`` don't use network isolation, this allow to use
.. tensorboard/visdom on host machine
.. - ``--ipc=host`` Use the host systems IPC namespace. IPC (POSIX/SysV IPC) namespace provides
.. separation of named shared memory segments, semaphores and message
.. queues.
.. - ``--name test`` give explicitly the name ``test`` to the container,
.. otherwise it will be assigned a random name
.. - ``--mount src=...`` give access of the local directory (``pwd``
.. command) to the container (it will be map to ``/root/code/stable-baselines``), so
.. all the logs created in the container in this folder will be kept
.. - ``bash -c '...'`` Run command inside the docker image, here run the tests
.. (``pytest tests/``)
..
.. .. _nvidia-docker: https://github.com/NVIDIA/nvidia-docker
.. .. _tweaks: https://stackoverflow.com/questions/23111631/cannot-download-docker-images-behind-a-proxy

12
docs/guide/migration.rst Normal file
View file

@ -0,0 +1,12 @@
.. _migration:
================================
Migrating from Stable-Baselines
================================
This is a guide to migrate from Stable-Baselines to Stable-Baselines3.
It also references the main changes.
**TODO**

View file

@ -6,26 +6,27 @@ Getting Started
Most of the library tries to follow a sklearn-like syntax for the Reinforcement Learning algorithms.
Here is a quick example of how to train and run SAC on a Pendulum environment:
Here is a quick example of how to train and run A2C on a CartPole environment:
.. code-block:: python
import gym
from stable_baselines3.sac.policies import MlpPolicy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import SAC
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy
env = gym.make('Pendulum-v0')
env = gym.make('CartPole-v1')
model = SAC(MlpPolicy, env, verbose=1)
model = A2C(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=10000)
obs = env.reset()
for i in range(1000):
action = model.predict(obs)
obs, rewards, dones, info = env.step(action)
action, _state = model.predict(obs, deterministic=True)
obs, reward, done, info = env.step(action)
env.render()
if done:
obs = env.reset()
Or just train a model with a one liner if
@ -34,6 +35,6 @@ the policy is registered:
.. code-block:: python
from stable_baselines3 import SAC
from stable_baselines3 import A2C
model = SAC('MlpPolicy', 'Pendulum-v0').learn(10000)
model = A2C('MlpPolicy', 'CartPole-v1').learn(10000)

17
docs/guide/rl.rst Normal file
View file

@ -0,0 +1,17 @@
.. _rl:
================================
Reinforcement Learning Resources
================================
Stable-Baselines3 assumes that you already understand the basic concepts of Reinforcement Learning (RL).
However, if you want to learn about RL, there are several good resources to get started:
- `OpenAI Spinning Up <https://spinningup.openai.com/en/latest/>`_
- `David Silver's course <http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html>`_
- `Lilian Weng's blog <https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html>`_
- `Berkeley's Deep RL Bootcamp <https://sites.google.com/view/deep-rl-bootcamp/lectures>`_
- `Berkeley's Deep Reinforcement Learning course <http://rail.eecs.berkeley.edu/deeprlcourse/>`_
- `More resources <https://github.com/dennybritz/reinforcement-learning>`_

251
docs/guide/rl_tips.rst Normal file
View file

@ -0,0 +1,251 @@
.. _rl_tips:
======================================
Reinforcement Learning Tips and Tricks
======================================
The aim of this section is to help you run reinforcement learning experiments.
It covers general advice about RL (where to start, which algorithm to choose, how to evaluate an algorithm, ...),
as well as tips and tricks when using a custom environment or implementing an RL algorithm.
General advice when using Reinforcement Learning
================================================
TL;DR
-----
1. Read about RL and Stable Baselines
2. Do quantitative experiments and hyperparameter tuning if needed
3. Evaluate the performance using a separate test environment
4. For better performance, increase the training budget
Like any other subject, if you want to work with RL, you should first read about it (we have a dedicated `resource page <rl.html>`_ to get you started)
to understand what you are using. We also recommend you read Stable Baselines (SB) documentation and do the `tutorial <https://github.com/araffin/rl-tutorial-jnrr19>`_.
It covers basic usage and guides you towards more advanced concepts of the library (e.g. callbacks and wrappers).
Reinforcement Learning differs from other machine learning methods in several ways. The data used to train the agent is collected
through interactions with the environment by the agent itself (compared to supervised learning where you have a fixed dataset for instance).
This dependence can lead to a vicious circle: if the agent collects poor quality data (e.g., trajectories with no rewards), then it will not improve and will continue to amass
bad trajectories.
This factor, among others, explains that results in RL may vary from one run to another (i.e., when only the seed of the pseudo-random generator changes).
For this reason, you should always do several runs to have quantitative results.
Good results in RL are generally dependent on finding appropriate hyperparameters. Recent algorithms (PPO, SAC, TD3) normally require little hyperparameter tuning,
however, *don't expect the default ones to work* on every environment.
Therefore, we *highly recommend* that you take a look at the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_ (or the original papers) for tuned hyperparameters.
A best practice when you apply RL to a new problem is to do automatic hyperparameter optimization. Again, this is included in the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_.
When applying RL to a custom problem, you should always normalize the input to the agent (e.g. using VecNormalize for PPO/A2C)
and look at common preprocessing done on other environments (e.g. for `Atari <https://danieltakeshi.github.io/2016/11/25/frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/>`_, frame-stack, ...).
Please refer to *Tips and Tricks when creating a custom environment* paragraph below for more advice related to custom environments.
Current Limitations of RL
-------------------------
You have to be aware of the current `limitations <https://www.alexirpan.com/2018/02/14/rl-hard.html>`_ of reinforcement learning.
Model-free RL algorithms (i.e. all the algorithms implemented in SB) are usually *sample inefficient*. They require a lot of samples (sometimes millions of interactions) to learn something useful.
That's why most of the successes in RL were achieved on games or in simulation only. For instance, in this `work <https://www.youtube.com/watch?v=aTDkYFZFWug>`_ by ETH Zurich, the ANYmal robot was trained in simulation only, and then tested in the real world.
As general advice, to obtain better performance, you should increase the budget of the agent (number of training timesteps).
In order to achieve the desired behavior, expert knowledge is often required to design an adequate reward function.
This *reward engineering* (or *RewArt* as coined by `Freek Stulp <http://www.freekstulp.net/>`_), necessitates several iterations. As a good example of reward shaping,
you can take a look at the `Deep Mimic paper <https://xbpeng.github.io/projects/DeepMimic/index.html>`_, which combines imitation learning and reinforcement learning to do acrobatic moves.
One last limitation of RL is the instability of training. That is to say, you can observe during training a huge drop in performance.
This behavior is particularly present in ``DDPG``, which is why its extension ``TD3`` tries to tackle that issue.
Other methods, like ``TRPO`` or ``PPO``, make use of a *trust region* to minimize that problem by avoiding too large an update.
How to evaluate an RL algorithm?
--------------------------------
Because most algorithms use exploration noise during training, you need a separate test environment to evaluate the performance
of your agent at a given time. It is recommended to periodically evaluate your agent for ``n`` test episodes (``n`` is usually between 5 and 20)
and average the reward per episode to have a good estimate.
As some policies are stochastic by default (e.g. A2C or PPO), you should also try to set ``deterministic=True`` when calling the ``.predict()`` method;
this frequently leads to better performance.
Looking at the training curve (episode reward as a function of the timesteps) is a good proxy but underestimates the agent's true performance.
.. note::
We provide an ``EvalCallback`` for doing such evaluation. You can read more about it in the :ref:`Callbacks <callbacks>` section.
We suggest reading `Deep Reinforcement Learning that Matters <https://arxiv.org/abs/1709.06560>`_ for a good discussion about RL evaluation.
You can also take a look at this `blog post <https://openlab-flowers.inria.fr/t/how-many-random-seeds-should-i-use-statistical-power-analysis-in-deep-reinforcement-learning-experiments/457>`_
and this `issue <https://github.com/hill-a/stable-baselines/issues/199>`_ by Cédric Colas.
Which algorithm should I use?
=============================
There is no silver bullet in RL, depending on your needs and problem, you may choose one or the other.
The first distinction comes from your action space, i.e., do you have discrete (e.g. LEFT, RIGHT, ...)
or continuous actions (ex: go to a certain speed)?
Some algorithms are only tailored for one or the other domain: ``DQN`` only supports discrete actions, whereas ``SAC`` is restricted to continuous actions.
The second difference that will help you choose is whether you can parallelize your training or not.
If what matters is the wall clock training time, then you should lean towards ``A2C`` and its derivatives (PPO, ...).
Take a look at the `Vectorized Environments <vec_envs.html>`_ to learn more about training with multiple workers.
To sum it up:
Discrete Actions
----------------
.. note::
This covers ``Discrete``, ``MultiDiscrete``, ``Binary`` and ``MultiBinary`` spaces
Discrete Actions - Single Process
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
DQN with extensions (double DQN, prioritized replay, ...) are the recommended algorithms.
DQN is usually slower to train (regarding wall clock time) but is the most sample efficient (because of its replay buffer).
Discrete Actions - Multiprocessed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You should give a try to PPO or A2C.
Continuous Actions
------------------
Continuous Actions - Single Process
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Current State Of The Art (SOTA) algorithms are ``SAC`` and ``TD3``.
Please use the hyperparameters in the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_ for best results.
Continuous Actions - Multiprocessed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Take a look at PPO, TRPO or A2C. Again, don't forget to take the hyperparameters from the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_
for continuous actions problems (cf *Bullet* envs).
.. note::
Normalization is critical for those algorithms
.. Goal Environment
.. -----------------
..
.. If your environment follows the ``GoalEnv`` interface (cf `HER <../modules/her.html>`_), then you should use
.. HER + (SAC/TD3/DDPG/DQN) depending on the action space.
..
..
.. .. note::
..
.. The number of workers is an important hyperparameters for experiments with HER
..
Tips and Tricks when creating a custom environment
==================================================
If you want to learn about how to create a custom environment, we recommend you read this `page <custom_env.html>`_.
We also provide a `colab notebook <https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/5_custom_gym_env.ipynb>`_ for
a concrete example of creating a custom gym environment.
Some basic advice:
- always normalize your observation space when you can, i.e., when you know the boundaries
- normalize your action space and make it symmetric when continuous (cf potential issue below). A good practice is to rescale your actions to lie in [-1, 1]. This does not limit you, as you can easily rescale the action inside the environment
- start with a shaped reward (i.e. informative reward) and a simplified version of your problem
- debug with random actions to check that your environment works and follows the gym interface:
We provide a helper to check that your environment runs without error:
.. code-block:: python
from stable_baselines3.common.env_checker import check_env
env = CustomEnv(arg1, ...)
# It will check your custom environment and output additional warnings if needed
check_env(env)
If you want to quickly try a random agent on your environment, you can also do:
.. code-block:: python
env = YourEnv()
obs = env.reset()
n_steps = 10
for _ in range(n_steps):
# Random action
action = env.action_space.sample()
obs, reward, done, info = env.step(action)
if done:
obs = env.reset()
**Why should I normalize the action space?**
Most reinforcement learning algorithms rely on a Gaussian distribution (initially centered at 0 with std 1) for continuous actions.
So, if you forget to normalize the action space when using a custom environment,
this can harm learning and be difficult to debug (cf attached image and `issue #473 <https://github.com/hill-a/stable-baselines/issues/473>`_).
.. figure:: ../_static/img/mistake.png
Another consequence of using a Gaussian is that the action range is not bounded.
That's why clipping is usually used as a bandage to stay in a valid interval.
A better solution would be to use a squashing function (cf ``SAC``) or a Beta distribution (cf `issue #112 <https://github.com/hill-a/stable-baselines/issues/112>`_).
.. note::
This statement is not true for ``DDPG`` or ``TD3`` because they don't rely on any probability distribution.
Tips and Tricks when implementing an RL algorithm
=================================================
When you try to reproduce an RL paper by implementing the algorithm, the `nuts and bolts of RL research <http://joschu.net/docs/nuts-and-bolts.pdf>`_
by John Schulman are quite useful (`video <https://www.youtube.com/watch?v=8EcdaCk9KaQ>`_).
We *recommend following those steps to have a working RL algorithm*:
1. Read the original paper several times
2. Read existing implementations (if available)
3. Try to have some "sign of life" on toy problems
4. Validate the implementation by making it run on harder and harder envs (you can compare results against the RL zoo)
You usually need to run hyperparameter optimization for that step.
You need to be particularly careful on the shape of the different objects you are manipulating (a broadcast mistake will fail silently cf `issue #75 <https://github.com/hill-a/stable-baselines/pull/76>`_)
and when to stop the gradient propagation.
A personal pick (by @araffin) for environments with gradual difficulty in RL with continuous actions:
1. Pendulum (easy to solve)
2. HalfCheetahBullet (medium difficulty with local minima and shaped reward)
3. BipedalWalkerHardcore (if it works on that one, then you can have a cookie)
in RL with discrete actions:
1. CartPole-v1 (easy to be better than random agent, harder to achieve maximal performance)
2. LunarLander
3. Pong (one of the easiest Atari games)
4. other Atari games (e.g. Breakout)

View file

@ -6,11 +6,11 @@ Vectorized Environments
=======================
Vectorized Environments are a method for stacking multiple independent environments into a single environment.
Instead of training an RL agent on 1 environment per step, it allows us to train it on `n` environments per step.
Because of this, `actions` passed to the environment are now a vector (of dimension `n`).
It is the same for `observations`, `rewards` and end of episode signals (`dones`).
In the case of non-array observation spaces such as `Dict` or `Tuple`, where different sub-spaces
may have different shapes, the sub-observations are vectors (of dimension `n`).
Instead of training an RL agent on 1 environment per step, it allows us to train it on ``n`` environments per step.
Because of this, ``actions`` passed to the environment are now a vector (of dimension ``n``).
It is the same for ``observations``, ``rewards`` and end of episode signals (``dones``).
In the case of non-array observation spaces such as ``Dict`` or ``Tuple``, where different sub-spaces
may have different shapes, the sub-observations are vectors (of dimension ``n``).
============= ======= ============ ======== ========= ================
Name ``Box`` ``Discrete`` ``Dict`` ``Tuple`` Multi Processing
@ -27,7 +27,7 @@ SubprocVecEnv ✔️ ✔️ ✔️ ✔️ ✔️
When using vectorized environments, the environments are automatically reset at the end of each episode.
Thus, the observation returned for the i-th environment when ``done[i]`` is true will in fact be the first observation of the next episode, not the last observation of the episode that has just terminated.
You can access the "real" final observation of the terminated episode—that is, the one that accompanied the ``done`` event provided by the underlying environment—using the ``terminal_observation`` keys in the info dicts returned by the `VecEnv`.
You can access the "real" final observation of the terminated episode—that is, the one that accompanied the ``done`` event provided by the underlying environment—using the ``terminal_observation`` keys in the info dicts returned by the vecenv.
.. warning::
@ -69,3 +69,24 @@ VecNormalize
.. autoclass:: VecNormalize
:members:
VecVideoRecorder
~~~~~~~~~~~~~~~~
.. autoclass:: VecVideoRecorder
:members:
VecCheckNan
~~~~~~~~~~~~~~~~
.. autoclass:: VecCheckNan
:members:
VecTransposeImage
~~~~~~~~~~~~~~~~~
.. autoclass:: VecTransposeImage
:members:

View file

@ -1,4 +1,4 @@
.. Stable Baselines documentation master file, created by
.. Stable Baselines3 documentation master file, created by
sphinx-quickstart on Thu Sep 26 11:06:54 2019.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
@ -6,21 +6,41 @@
Welcome to Stable Baselines3 docs!
==================================
`Stable Baselines3 <https://github.com/DLR-RM/stable-baselines3>`_ is the next major version (PyTorch edition) of `Stable Baselines <https://github.com/hill-a/stable-baselines>`_,
a set of improved implementations of reinforcement learning algorithms.
`Stable Baselines3 <https://github.com/DLR-RM/stable-baselines3>`_ is a set of improved implementations of reinforcement learning algorithms in PyTorch.
It is the next major version (PyTorch edition) of `Stable Baselines <https://github.com/hill-a/stable-baselines>`_.
Github repository: https://github.com/DLR-RM/stable-baselines3
RL Baselines3 Zoo (collection of pre-trained agents): https://github.com/DLR-RM/rl-baselines3-zoo
RL Baselines3 Zoo also offers a simple interface to train, evaluate agents and do hyperparameter tuning.
Main Features
--------------
- Unified structure for all algorithms
- PEP8 compliant (unified code style)
- Documented functions and classes
- Tests, high code coverage and type hints
- Clean code
.. toctree::
:maxdepth: 2
:caption: User Guide
guide/install
guide/quickstart
guide/rl_tips
guide/rl
guide/vec_envs
guide/custom_env
guide/callbacks
guide/migration
guide/checking_nan
.. toctree::
@ -33,12 +53,20 @@ RL Baselines3 Zoo also offers a simple interface to train, evaluate agents and d
modules/sac
modules/td3
.. toctree::
:maxdepth: 1
:caption: Common
common/distributions
common/evaluation
common/env_checker
.. toctree::
:maxdepth: 1
:caption: Misc
misc/changelog
misc/projects
Citing Stable Baselines3

View file

@ -29,6 +29,7 @@ Others:
Documentation:
^^^^^^^^^^^^^^
- Added most documentation (adapted from Stable-Baselines)
Pre-Release 0.5.0 (2020-05-05)

26
docs/misc/projects.rst Normal file
View file

@ -0,0 +1,26 @@
.. _projects:
Projects
=========
This is a list of projects using stable-baselines3.
Please tell us, if you want your project to appear on this page ;)
.. RL Racing Robot
.. --------------------------
.. Implementation of reinforcement learning approach to make a donkey car learn to race.
.. Uses SAC on autoencoder features
..
.. | Author: Antonin Raffin (@araffin)
.. | Github repo: https://github.com/araffin/RL-Racing-Robot
.. Generalized State Dependent Exploration for Deep Reinforcement Learning in Robotics
.. -----------------------------------------------------------------------------------
..
.. An exploration method to train RL agent directly on real robots.
..
.. | Author: Antonin Raffin, Freek Stulp
.. | Github: https://github.com/DLR-RM/stable-baselines3/tree/sde
.. | Paper:

View file

@ -38,15 +38,15 @@ MultiBinary ❌ ❌
Example
-------
Train a A2C agent on `CartPole-v1` using 4 processes.
Train an A2C agent on ``CartPole-v1`` using 4 environments.
.. code-block:: python
import gym
from stable_baselines3.common.policies import MlpPolicy
from stable_baselines3.common import make_vec_env
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy
from stable_baselines3.common.cmd_utils import make_vec_env
# Parallel environments
env = make_vec_env('CartPole-v1', n_envs=4)

View file

@ -10,3 +10,12 @@ Common interface for all the RL algorithms
.. autoclass:: BaseRLModel
:members:
Base RL Class
=============
The base RL model for Off-Policy algorithms (e.g. SAC/TD3)
.. autoclass:: OffPolicyRLModel
:members:

View file

@ -47,34 +47,32 @@ MultiBinary ❌ ❌
Example
-------
Train a PPO agent on `Pendulum-v0` using 4 processes.
Train a PPO agent on ``CartPole-v1`` using 4 environments.
.. code-block:: python
import gym
import gym
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3 import PPO
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.cmd_utils import make_vec_env
# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('Pendulum-v0') for i in range(n_cpu)])
# Parallel environments
env = make_vec_env('CartPole-v1', n_envs=4)
model = PPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("ppo2_cartpole")
model = PPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("ppo_cartpole")
del model # remove to demonstrate saving and loading
del model # remove to demonstrate saving and loading
model = PPO.load("ppo2_cartpole")
model = PPO.load("ppo_cartpole")
# Enjoy trained agent
obs = env.reset()
while True:
action, _states = model.predict(obs)
obs, rewards, dones, info = env.step(action)
env.render()
obs = env.reset()
while True:
action, _states = model.predict(obs)
obs, rewards, dones, info = env.step(action)
env.render()
Parameters
----------

View file

@ -14,7 +14,7 @@ A key feature of SAC, and a major difference with common RL algorithms, is that
.. warning::
The SAC model does not support ``stable_baselines3.common.policies`` because it uses double q-values
The SAC model does not support ``stable_baselines3.ppo.policies`` because it uses double q-values
and value estimation, as a result it must use its own policy models (see :ref:`sac_policies`).
@ -24,6 +24,7 @@ A key feature of SAC, and a major difference with common RL algorithms, is that
:nosignatures:
MlpPolicy
CnnPolicy
Notes
@ -72,15 +73,13 @@ Example
import gym
import numpy as np
from stable_baselines3.sac.policies import MlpPolicy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import SAC
from stable_baselines3.sac import MlpPolicy
env = gym.make('Pendulum-v0')
env = DummyVecEnv([lambda: env])
model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=50000, log_interval=10)
model.learn(total_timesteps=10000, log_interval=4)
model.save("sac_pendulum")
del model # remove to demonstrate saving and loading
@ -90,8 +89,10 @@ Example
obs = env.reset()
while True:
action, _states = model.predict(obs)
obs, rewards, dones, info = env.step(action)
obs, reward, done, info = env.step(action)
env.render()
if done:
obs = env.reset()
Parameters
----------
@ -108,3 +109,7 @@ SAC Policies
.. autoclass:: MlpPolicy
:members:
:inherited-members:
.. .. autoclass:: CnnPolicy
.. :members:
.. :inherited-members:

View file

@ -14,7 +14,7 @@ We recommend reading `OpenAI Spinning guide on TD3 <https://spinningup.openai.co
.. warning::
The TD3 model does not support ``stable_baselines3.common.policies`` because it uses double q-values
The TD3 model does not support ``stable_baselines3.ppo.policies`` because it uses double q-values
estimation, as a result it must use its own policy models (see :ref:`td3_policies`).
@ -73,7 +73,7 @@ Example
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
model = TD3(MlpPolicy, 'Pendulum-v0', action_noise=action_noise, verbose=1)
model.learn(total_timesteps=50000, log_interval=10)
model.learn(total_timesteps=10000, log_interval=10)
model.save("td3_pendulum")
env = model.get_env()
@ -87,6 +87,7 @@ Example
obs, rewards, dones, info = env.step(action)
env.render()
Parameters
----------
@ -102,3 +103,8 @@ TD3 Policies
.. autoclass:: MlpPolicy
:members:
:inherited-members:
.. .. autoclass:: CnnPolicy
.. :members:
.. :inherited-members:

View file

@ -50,8 +50,10 @@ model.learn(total_timesteps=10000)
obs = env.reset()
for i in range(1000):
action, _states = model.predict(obs, deterministic=True)
obs, rewards, dones, info = env.step(action)
obs, reward, done, info = env.step(action)
env.render()
if done:
obs = env.reset()
```
Or just train a model with a one liner if [the environment is registered in Gym](https://github.com/openai/gym/wiki/Environments) and if [the policy is registered](https://stable-baselines.readthedocs.io/en/master/guide/custom_policy.html):

View file

@ -1,2 +1,2 @@
from stable_baselines3.a2c.a2c import A2C
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.ppo.policies import MlpPolicy, CnnPolicy

View file

@ -1,2 +1,2 @@
from stable_baselines3.ppo.ppo import PPO
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.ppo.policies import MlpPolicy, CnnPolicy

View file

@ -1,2 +1,2 @@
from stable_baselines3.sac.sac import SAC
from stable_baselines3.sac.policies import MlpPolicy
from stable_baselines3.sac.policies import MlpPolicy, CnnPolicy

View file

@ -1,2 +1,2 @@
from stable_baselines3.td3.td3 import TD3
from stable_baselines3.td3.policies import MlpPolicy
from stable_baselines3.td3.policies import MlpPolicy, CnnPolicy