Add base doc

2026-07-16 18:31:44 +00:00 · 2020-05-07 10:10:51 +02:00 · 2020-05-07 10:10:51 +02:00 · d17f29c8ad
commit d17f29c8ad
parent 94b1267817
29 changed files with 1194 additions and 58 deletions
--- a/README.md
+++ b/README.md
@ -35,6 +35,29 @@ These algorithms will make it easier for the research community and industry to

 <!-- | Tensorboard support         | :heavy_check_mark: | -->

+### Roadmap to V1.0
+
+Please look at the issue for more details.
+Planned features:
+
+- [ ] DQN (almost ready, currently in testing phase)
+- [ ] DDPG (you can use its successor TD3 for now)
+- [ ] HER
+- [ ] Support for MultiDiscrete and MultiBinary action spaces
+
+### Planned features (v1.1+)
+
+- [ ] Full Tensorboard support
+- [ ] DQN extensions (prioritized replay, double q-learning, ...)
+- [ ] Support for `Tuple` and `Dict` observation spaces
+- [ ] Recurrent Policies
+- [ ] TRPO
+
+
+## Migration guide
+
+**TODO: migration guide from Stable-Baselines in the documentation**
+
 ## Documentation

 Documentation is available online: [https://stable-baselines.readthedocs.io/](https://stable-baselines.readthedocs.io/)
@ -102,8 +125,10 @@ model.learn(total_timesteps=10000)
 obs = env.reset()
 for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
-    obs, rewards, dones, info = env.step(action)
+    obs, reward, done, info = env.step(action)
    env.render()
+    if done:
+      obs = env.reset()

 env.close()
 ```
--- a/docs/README.md
+++ b/docs/README.md
@ -1,4 +1,4 @@
-## Stable Baselines Documentation
+## Stable Baselines3 Documentation

 This folder contains documentation for the RL baselines.

--- a/docs/_static/img/mistake.png
+++ b/docs/_static/img/mistake.png
--- a/docs/common/distributions.rst
+++ b/docs/common/distributions.rst
@ -0,0 +1,26 @@
+.. _distributions:
+
+Probability Distributions
+=========================
+
+Probability distributions used for the different action spaces:
+
+- ``CategoricalDistribution`` -> Discrete
+- ``DiagGaussianDistribution`` -> Box (continuous actions)
+- ``StateDependentNoiseDistribution`` -> Box (continuous actions) when ``use_sde=True``
+
+.. - ``MultiCategoricalDistribution`` -> MultiDiscrete
+.. - ``BernoulliDistribution`` -> MultiBinary
+
+The policy networks output parameters for the distributions (named ``flat`` in the methods).
+Actions are then sampled from those distributions.
+
+For instance, in the case of discrete actions, the policy network outputs the probability
+of taking each action. The ``CategoricalDistribution`` allows one to sample from it,
+compute the entropy and the log probability (``log_prob``), and backpropagate the gradient.
+
+In the case of continuous actions, a Gaussian distribution is used. The policy network outputs
+mean and (log) std of the distribution (assumed to be a ``DiagGaussianDistribution``).
+
+.. automodule:: stable_baselines3.common.distributions
+  :members:
--- a/docs/common/env_checker.rst
+++ b/docs/common/env_checker.rst
@ -0,0 +1,7 @@
+.. _env_checker:
+
+Gym Environment Checker
+========================
+
+.. automodule:: stable_baselines3.common.env_checker
+  :members:
--- a/docs/common/evaluation.rst
+++ b/docs/common/evaluation.rst
@ -0,0 +1,7 @@
+.. _eval:
+
+Evaluation Helper
+=================
+
+.. automodule:: stable_baselines3.common.evaluation
+  :members:
--- a/docs/conf.py
+++ b/docs/conf.py
@ -216,5 +216,5 @@ texinfo_documents = [
 # }

 # kornia's hack to get rtd builder to install latest pytorch
-# if 'READTHEDOCS' in os.environ:
-#     os.system('pip install torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html')
+if on_rtd:
+    os.system('pip install torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html')
--- a/docs/guide/callbacks.rst
+++ b/docs/guide/callbacks.rst
@ -0,0 +1,296 @@
+.. _callbacks:
+
+Callbacks
+=========
+
+A callback is a set of functions that will be called at given stages of the training procedure.
+You can use callbacks to access internal state of the RL model during training.
+It allows one to do monitoring, auto saving, model manipulation, progress bars, ...
+
+
+Custom Callback
+---------------
+
+To build a custom callback, you need to create a class that derives from ``BaseCallback``.
+This will give you access to events (``_on_training_start``, ``_on_step``) and useful variables (like `self.model` for the RL model).
+
+
+.. You can find two examples of custom callbacks in the documentation: one for saving the best model according to the training reward (see :ref:`Examples <examples>`), and one for logging additional values with Tensorboard (see :ref:`Tensorboard section <tensorboard>`).
+
+
+.. code-block:: python
+
+    from stable_baselines3.common.callbacks import BaseCallback
+
+
+    class CustomCallback(BaseCallback):
+        """
+        A custom callback that derives from ``BaseCallback``.
+
+        :param verbose: (int) Verbosity level 0: not output 1: info 2: debug
+        """
+        def __init__(self, verbose=0):
+            super(CustomCallback, self).__init__(verbose)
+            # Those variables will be accessible in the callback
+            # (they are defined in the base class)
+            # The RL model
+            # self.model = None  # type: BaseRLModel
+            # An alias for self.model.get_env(), the environment used for training
+            # self.training_env = None  # type: Union[gym.Env, VecEnv, None]
+            # Number of time the callback was called
+            # self.n_calls = 0  # type: int
+            # self.num_timesteps = 0  # type: int
+            # local and global variables
+            # self.locals = None  # type: Dict[str, Any]
+            # self.globals = None  # type: Dict[str, Any]
+            # The logger object, used to report things in the terminal
+            # self.logger = None  # type: logger.Logger
+            # # Sometimes, for event callback, it is useful
+            # # to have access to the parent object
+            # self.parent = None  # type: Optional[BaseCallback]
+
+        def _on_training_start(self) -> None:
+            """
+            This method is called before the first rollout starts.
+            """
+            pass
+
+        def _on_rollout_start(self) -> None:
+            """
+            A rollout is the collection of environment interaction
+            using the current policy.
+            This event is triggered before collecting new samples.
+            """
+            pass
+
+        def _on_step(self) -> bool:
+            """
+            This method will be called by the model after each call to `env.step()`.
+
+            For child callback (of an `EventCallback`), this will be called
+            when the event is triggered.
+
+            :return: (bool) If the callback returns False, training is aborted early.
+            """
+            return True
+
+        def _on_rollout_end(self) -> None:
+            """
+            This event is triggered before updating the policy.
+            """
+            pass
+
+        def _on_training_end(self) -> None:
+            """
+            This event is triggered before exiting the `learn()` method.
+            """
+            pass
+
+
+.. note::
+  ``self.num_timesteps`` corresponds to the total number of steps taken in the environment, i.e., it is the number of environments multiplied by the number of times ``env.step()`` was called.
+
+  In other words, ``self.num_timesteps`` is incremented by ``n_envs`` (number of environments) after each call to ``env.step()``.
+
+
+.. note::
+
+  For off-policy algorithms like SAC, DDPG, TD3 or DQN, the notion of ``rollout`` corresponds to the steps taken in the environment between two updates.
+
+
+.. _EventCallback:
+
+Event Callback
+--------------
+
+Compared to Keras, Stable Baselines provides a second type of ``BaseCallback``, named ``EventCallback`` that is meant to trigger events. When an event is triggered, then a child callback is called.
+
+As an example, :ref:`EvalCallback` is an ``EventCallback`` that will trigger its child callback when there is a new best model.
+A child callback is for instance :ref:`StopTrainingOnRewardThreshold <StopTrainingCallback>` that stops the training if the mean reward achieved by the RL model is above a threshold.
+
+.. note::
+
+	We recommend to take a look at the source code of :ref:`EvalCallback` and :ref:`StopTrainingOnRewardThreshold <StopTrainingCallback>` to have a better overview of what can be achieved with this kind of callbacks.
+
+
+.. code-block:: python
+
+    class EventCallback(BaseCallback):
+        """
+        Base class for triggering callback on event.
+
+        :param callback: (Optional[BaseCallback]) Callback that will be called
+            when an event is triggered.
+        :param verbose: (int)
+        """
+        def __init__(self, callback: Optional[BaseCallback] = None, verbose: int = 0):
+            super(EventCallback, self).__init__(verbose=verbose)
+            self.callback = callback
+            # Give access to the parent
+            if callback is not None:
+                self.callback.parent = self
+        ...
+
+        def _on_event(self) -> bool:
+            if self.callback is not None:
+                return self.callback()
+            return True
+
+
+
+Callback Collection
+-------------------
+
+Stable Baselines provides you with a set of common callbacks for:
+
+- saving the model periodically (:ref:`CheckpointCallback`)
+- evaluating the model periodically and saving the best one (:ref:`EvalCallback`)
+- chaining callbacks (:ref:`CallbackList`)
+- triggering callback on events (:ref:`EventCallback`, :ref:`EveryNTimesteps`)
+- stopping the training early based on a reward threshold (:ref:`StopTrainingOnRewardThreshold <StopTrainingCallback>`)
+
+
+.. _CheckpointCallback:
+
+CheckpointCallback
+^^^^^^^^^^^^^^^^^^
+
+Callback for saving a model every ``save_freq`` steps, you must specify a log folder (``save_path``)
+and optionally a prefix for the checkpoints (``rl_model`` by default).
+
+
+.. code-block:: python
+
+    from stable_baselines3 import SAC
+    from stable_baselines3.common.callbacks import CheckpointCallback
+    # Save a checkpoint every 1000 steps
+    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/',
+                                             name_prefix='rl_model')
+
+    model = SAC('MlpPolicy', 'Pendulum-v0')
+    model.learn(2000, callback=checkpoint_callback)
+
+
+.. _EvalCallback:
+
+EvalCallback
+^^^^^^^^^^^^
+
+Evaluate periodically the performance of an agent, using a separate test environment.
+It will save the best model if ``best_model_save_path`` folder is specified and save the evaluations results in a numpy archive (`evaluations.npz`) if ``log_path`` folder is specified.
+
+
+.. note::
+
+	You can pass a child callback via the ``callback_on_new_best`` argument. It will be triggered each time there is a new best model.
+
+
+
+.. code-block:: python
+
+    import gym
+
+    from stable_baselines3 import SAC
+    from stable_baselines3.common.callbacks import EvalCallback
+
+    # Separate evaluation env
+    eval_env = gym.make('Pendulum-v0')
+    # Use deterministic actions for evaluation
+    eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/',
+                                 log_path='./logs/', eval_freq=500,
+                                 deterministic=True, render=False)
+
+    model = SAC('MlpPolicy', 'Pendulum-v0')
+    model.learn(5000, callback=eval_callback)
+
+
+.. _Callbacklist:
+
+CallbackList
+^^^^^^^^^^^^
+
+Class for chaining callbacks, they will be called sequentially.
+Alternatively, you can pass directly a list of callbacks to the `learn()` method, it will be converted automatically to a ``CallbackList``.
+
+
+.. code-block:: python
+
+    import gym
+
+    from stable_baselines3 import SAC
+    from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback
+
+    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/')
+    # Separate evaluation env
+    eval_env = gym.make('Pendulum-v0')
+    eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model',
+                                 log_path='./logs/results', eval_freq=500)
+    # Create the callback list
+    callback = CallbackList([checkpoint_callback, eval_callback])
+
+    model = SAC('MlpPolicy', 'Pendulum-v0')
+    # Equivalent to:
+    # model.learn(5000, callback=[checkpoint_callback, eval_callback])
+    model.learn(5000, callback=callback)
+
+
+.. _StopTrainingCallback:
+
+StopTrainingOnRewardThreshold
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Stop the training once a threshold in episodic reward (mean episode reward over the evaluations) has been reached (i.e., when the model is good enough).
+It must be used with the :ref:`EvalCallback` and use the event triggered by a new best model.
+
+
+.. code-block:: python
+
+    import gym
+
+    from stable_baselines3 import SAC
+    from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
+
+    # Separate evaluation env
+    eval_env = gym.make('Pendulum-v0')
+    # Stop training when the model reaches the reward threshold
+    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1)
+    eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1)
+
+    model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1)
+    # Almost infinite number of timesteps, but the training will stop
+    # early as soon as the reward threshold is reached
+    model.learn(int(1e10), callback=eval_callback)
+
+
+.. _EveryNTimesteps:
+
+EveryNTimesteps
+^^^^^^^^^^^^^^^
+
+An :ref:`EventCallback` that will trigger its child callback every ``n_steps`` timesteps.
+
+
+.. note::
+
+	Because of the way ``PPO1`` and ``TRPO`` work (they rely on MPI), ``n_steps`` is a lower bound between two events.
+
+
+.. code-block:: python
+
+  import gym
+
+  from stable_baselines3 import PPO
+  from stable_baselines3.common.callbacks import CheckpointCallback, EveryNTimesteps
+
+  # this is equivalent to defining CheckpointCallback(save_freq=500)
+  # checkpoint_callback will be triggered every 500 steps
+  checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/')
+  event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)
+
+  model = PPO('MlpPolicy', 'Pendulum-v0', verbose=1)
+
+  model.learn(int(2e4), callback=event_callback)
+
+
+.. automodule:: stable_baselines3.common.callbacks
+  :members:
--- a/docs/guide/checking_nan.rst
+++ b/docs/guide/checking_nan.rst
@ -0,0 +1,164 @@
+Dealing with NaNs and infs
+==========================
+
+During the training of a model on a given environment, it is possible that the RL model becomes completely
+corrupted when a NaN or an inf is given or returned from the RL model.
+
+How and why?
+------------
+
+The issue arises when NaNs or infs do not crash the program, but simply get propagated through the training,
+until all the floating point numbers converge to NaN or inf. This is in line with the
+`IEEE Standard for Floating-Point Arithmetic (IEEE 754) <https://ieeexplore.ieee.org/document/4610935>`_, as it says:
+
+.. note::
+    Five possible exceptions can occur:
+        - Invalid operation (:math:`\sqrt{-1}`, :math:`\inf \times 0`, :math:`\text{NaN}\ \mathrm{mod}\ 1`, ...) returns NaN
+        - Division by zero:
+            - if the operand is not zero (:math:`1/0`, :math:`-2/0`, ...) returns :math:`\pm\inf`
+            - if the operand is zero (:math:`0/0`) returns signaling NaN
+        - Overflow (exponent too high to represent) returns :math:`\pm\inf`
+        - Underflow (exponent too low to represent) returns :math:`0`
+        - Inexact (not representable exactly in base 2, eg: :math:`1/5`) returns the rounded value (ex: :code:`assert (1/5) * 3 == 0.6000000000000001`)
+
+And of these, only ``Division by zero`` will signal an exception, the rest will propagate invalid values quietly.
+
+In python, dividing by zero will indeed raise the exception: ``ZeroDivisionError: float division by zero``,
+but ignores the rest.
+
+By default, numpy will warn: ``RuntimeWarning: invalid value encountered``
+but will not halt the code.
+
+
+Anomaly detection with PyTorch
+------------------------------
+
+To enable NaN detection in PyTorch you can do
+
+.. code-block:: python
+
+  import torch as th
+  th.autograd.set_detect_anomaly(True)
+
+
+Numpy parameters
+----------------
+
+Numpy has a convenient way of dealing with invalid values: `numpy.seterr <https://docs.scipy.org/doc/numpy/reference/generated/numpy.seterr.html>`_,
+which defines, for the python process, how it should handle floating point errors.
+
+.. code-block:: python
+
+  import numpy as np
+
+  np.seterr(all='raise')  # define before your code.
+
+  print("numpy test:")
+
+  a = np.float64(1.0)
+  b = np.float64(0.0)
+  val = a / b  # this will now raise an exception instead of a warning.
+  print(val)
+
+but this will also avoid overflow issues on floating point numbers:
+
+.. code-block:: python
+
+  import numpy as np
+
+  np.seterr(all='raise')  # define before your code.
+
+  print("numpy overflow test:")
+
+  a = np.float64(10)
+  b = np.float64(1000)
+  val = a ** b  # this will now raise an exception
+  print(val)
+
+but will not avoid the propagation issues:
+
+.. code-block:: python
+
+  import numpy as np
+
+  np.seterr(all='raise')  # define before your code.
+
+  print("numpy propagation test:")
+
+  a = np.float64('NaN')
+  b = np.float64(1.0)
+  val = a + b  # this will neither warn nor raise anything
+  print(val)
+
+
+VecCheckNan Wrapper
+-------------------
+
+In order to find when and where the invalid value originated, stable-baselines3 comes with a ``VecCheckNan`` wrapper.
+
+It will monitor the actions, observations, and rewards, indicating what action or observation caused it and from what.
+
+.. code-block:: python
+
+  import gym
+  from gym import spaces
+  import numpy as np
+
+  from stable_baselines3 import PPO
+  from stable_baselines3.common.vec_env import DummyVecEnv, VecCheckNan
+
+  class NanAndInfEnv(gym.Env):
+      """Custom Environment that raised NaNs and Infs"""
+      metadata = {'render.modes': ['human']}
+
+      def __init__(self):
+          super(NanAndInfEnv, self).__init__()
+          self.action_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)
+          self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)
+
+      def step(self, _action):
+          randf = np.random.rand()
+          if randf > 0.99:
+              obs = float('NaN')
+          elif randf > 0.98:
+              obs = float('inf')
+          else:
+              obs = randf
+          return [obs], 0.0, False, {}
+
+      def reset(self):
+          return [0.0]
+
+      def render(self, mode='human', close=False):
+          pass
+
+  # Create environment
+  env = DummyVecEnv([lambda: NanAndInfEnv()])
+  env = VecCheckNan(env, raise_exception=True)
+
+  # Instantiate the agent
+  model = PPO('MlpPolicy', env)
+
+  # Train the agent
+  model.learn(total_timesteps=int(2e5))  # this will crash explaining that the invalid value originated from the environment.
+
+RL Model hyperparameters
+------------------------
+
+Depending on your hyperparameters, NaNs can occur much more often.
+A great example of this: https://github.com/hill-a/stable-baselines/issues/340
+
+Be aware, the hyperparameters given by default seem to work in most cases,
+however your environment might not play nice with them.
+If this is the case, try to read up on the effect each hyperparameter has on the model,
+so that you can try and tune them to get a stable model. Alternatively, you can try automatic hyperparameter tuning (included in the rl zoo).
+
+Missing values from datasets
+----------------------------
+
+If your environment is generated from an external dataset, do not forget to make sure your dataset does not contain NaNs,
+as some datasets will fill missing values with NaNs as a surrogate value.
+
+Here is some reading material about finding NaNs: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
+
+And filling the missing values with something else (imputation): https://towardsdatascience.com/how-to-handle-missing-data-8646b18db0d4
--- a/docs/guide/custom_env.rst
+++ b/docs/guide/custom_env.rst
@ -0,0 +1,82 @@
+.. _custom_env:
+
+Using Custom Environments
+==========================
+
+To use the rl baselines with custom environments, they just need to follow the *gym* interface.
+That is to say, your environment must implement the following methods (and inherit from the OpenAI Gym ``gym.Env`` class):
+
+
+.. note::
+	If you are using images as input, the input values must be in [0, 255] as the observation
+	is normalized (dividing by 255 to have values in [0, 1]) when using CNN policies.
+
+
+
+.. code-block:: python
+
+  import gym
+  from gym import spaces
+
+  class CustomEnv(gym.Env):
+    """Custom Environment that follows gym interface"""
+    metadata = {'render.modes': ['human']}
+
+    def __init__(self, arg1, arg2, ...):
+      super(CustomEnv, self).__init__()
+      # Define action and observation space
+      # They must be gym.spaces objects
+      # Example when using discrete actions:
+      self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)
+      # Example for using image as input:
+      self.observation_space = spaces.Box(low=0, high=255,
+                                          shape=(HEIGHT, WIDTH, N_CHANNELS), dtype=np.uint8)
+
+    def step(self, action):
+      ...
+      return observation, reward, done, info
+    def reset(self):
+      ...
+      return observation  # reward, done, info can't be included
+    def render(self, mode='human'):
+      ...
+    def close (self):
+      ...
+
+
+Then you can define and train a RL agent with:
+
+.. code-block:: python
+
+  # Instantiate the env
+  env = CustomEnv(arg1, ...)
+  # Define and Train the agent
+  model = A2C('CnnPolicy', env).learn(total_timesteps=1000)
+
+
+To check that your environment follows the gym interface, please use:
+
+.. code-block:: python
+
+	from stable_baselines3.common.env_checker import check_env
+
+	env = CustomEnv(arg1, ...)
+	# It will check your custom environment and output additional warnings if needed
+	check_env(env)
+
+
+
+We have created a `colab notebook <https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/5_custom_gym_env.ipynb>`_ for
+a concrete example of creating a custom environment.
+
+You can also find a `complete guide online <https://github.com/openai/gym/blob/master/docs/creating-environments.md>`_
+on creating a custom Gym environment.
+
+
+Optionally, you can also register the environment with gym,
+that will allow you to create the RL agent in one line (and use ``gym.make()`` to instantiate the env).
+
+
+In the project, for testing purposes, we use a custom environment named ``IdentityEnv``
+defined `in this file <https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/common/identity_env.py>`_.
+An example of how to use it can be found `here <https://github.com/hill-a/stable-baselines/blob/master/tests/test_identity.py>`_.
--- a/docs/guide/install.rst
+++ b/docs/guide/install.rst
@ -0,0 +1,152 @@
+.. _install:
+
+Installation
+============
+
+Prerequisites
+-------------
+
+Stable-Baselines3 requires python 3.6+.
+
+Windows 10
+~~~~~~~~~~
+
+We recommend using `Anaconda <https://conda.io/docs/user-guide/install/windows.html>`_ for Windows users for easier installation of Python packages and required libraries. You need an environment with Python version 3.6 or above.
+
+For a quick start you can move straight to installing Stable-Baselines3 in the next step.
+
+.. note::
+
+	Trying to create Atari environments may result in vague errors related to missing DLL files and modules. This is an
+	issue with the atari-py package. `See this discussion for more information <https://github.com/openai/atari-py/issues/65>`_.
+
+
+Stable Release
+~~~~~~~~~~~~~~
+To install Stable Baselines3 with pip, execute:
+
+.. code-block:: bash
+
+    pip install stable-baselines3[extra]
+
+This includes an optional dependency OpenCV to display the environments when using ``SubprocVecEnv``. If you do not need it, you can install without OpenCV:
+
+
+.. code-block:: bash
+
+    pip install stable-baselines3
+
+
+Bleeding-edge version
+---------------------
+
+.. code-block:: bash
+
+	pip install git+https://github.com/DLR-RM/stable-baselines3
+
+
+Development version
+-------------------
+
+To contribute to Stable-Baselines3, install it from source with support for running tests and building the documentation:
+
+.. code-block:: bash
+
+    git clone https://github.com/DLR-RM/stable-baselines3 && cd stable-baselines3
+    pip install -e .[docs,tests,extra]
+
+
+.. Using Docker Images
+.. -------------------
+..
+.. If you are looking for docker images with stable-baselines already installed in it,
+.. we recommend using images from `RL Baselines3 Zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_.
+..
+.. Otherwise, the following images contained all the dependencies for stable-baselines3 but not the stable-baselines3 package itself.
+.. They are made for development.
+..
+.. Use Built Images
+.. ~~~~~~~~~~~~~~~~
+..
+.. GPU image (requires `nvidia-docker`_):
+..
+.. .. code-block:: bash
+..
+..    docker pull stablebaselines/stable-baselines3
+..
+.. CPU only:
+..
+.. .. code-block:: bash
+..
+..    docker pull stablebaselines/stable-baselines3-cpu
+..
+.. Build the Docker Images
+.. ~~~~~~~~~~~~~~~~~~~~~~~~
+..
+.. Build GPU image (with nvidia-docker):
+..
+.. .. code-block:: bash
+..
+..    make docker-gpu
+..
+.. Build CPU image:
+..
+.. .. code-block:: bash
+..
+..    make docker-cpu
+..
+.. Note: if you are using a proxy, you need to pass extra params during
+.. build and do some `tweaks`_:
+..
+.. .. code-block:: bash
+..
+..    --network=host --build-arg HTTP_PROXY=http://your.proxy.fr:8080/ --build-arg http_proxy=http://your.proxy.fr:8080/ --build-arg HTTPS_PROXY=https://your.proxy.fr:8080/ --build-arg https_proxy=https://your.proxy.fr:8080/
+..
+.. Run the images (CPU/GPU)
+.. ~~~~~~~~~~~~~~~~~~~~~~~~
+..
+.. Run the nvidia-docker GPU image
+..
+.. .. code-block:: bash
+..
+..    docker run -it --runtime=nvidia --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/root/code/stable-baselines,type=bind stablebaselines/stable-baselines bash -c 'cd /root/code/stable-baselines/ && pytest tests/'
+..
+.. Or, with the shell file:
+..
+.. .. code-block:: bash
+..
+..    ./scripts/run_docker_gpu.sh pytest tests/
+..
+.. Run the docker CPU image
+..
+.. .. code-block:: bash
+..
+..    docker run -it --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/root/code/stable-baselines,type=bind stablebaselines/stable-baselines-cpu bash -c 'cd /root/code/stable-baselines/ && pytest tests/'
+..
+.. Or, with the shell file:
+..
+.. .. code-block:: bash
+..
+..    ./scripts/run_docker_cpu.sh pytest tests/
+..
+.. Explanation of the docker command:
+..
+.. -  ``docker run -it`` create an instance of an image (=container), and
+..    run it interactively (so ctrl+c will work)
+.. -  ``--rm`` option means to remove the container once it exits/stops
+..    (otherwise, you will have to use ``docker rm``)
+.. -  ``--network host`` don't use network isolation, this allow to use
+..    tensorboard/visdom on host machine
+.. -  ``--ipc=host`` Use the host system’s IPC namespace. IPC (POSIX/SysV IPC) namespace provides
+..    separation of named shared memory segments, semaphores and message
+..    queues.
+.. -  ``--name test`` give explicitly the name ``test`` to the container,
+..    otherwise it will be assigned a random name
+.. -  ``--mount src=...`` give access of the local directory (``pwd``
+..    command) to the container (it will be map to ``/root/code/stable-baselines``), so
+..    all the logs created in the container in this folder will be kept
+.. -  ``bash -c '...'`` Run command inside the docker image, here run the tests
+..    (``pytest tests/``)
+..
+.. .. _nvidia-docker: https://github.com/NVIDIA/nvidia-docker
+.. .. _tweaks: https://stackoverflow.com/questions/23111631/cannot-download-docker-images-behind-a-proxy
--- a/docs/guide/migration.rst
+++ b/docs/guide/migration.rst
@ -0,0 +1,12 @@
+.. _migration:
+
+================================
+Migrating from Stable-Baselines
+================================
+
+
+This is a guide to migrate from Stable-Baselines to Stable-Baselines3.
+
+It also references the main changes.
+
+**TODO**
--- a/docs/guide/quickstart.rst
+++ b/docs/guide/quickstart.rst
@ -6,26 +6,27 @@ Getting Started

 Most of the library tries to follow a sklearn-like syntax for the Reinforcement Learning algorithms.

-Here is a quick example of how to train and run SAC on a Pendulum environment:
+Here is a quick example of how to train and run A2C on a CartPole environment:

 .. code-block:: python

  import gym

-  from stable_baselines3.sac.policies import MlpPolicy
-  from stable_baselines3.common.vec_env import DummyVecEnv
-  from stable_baselines3 import SAC
+  from stable_baselines3 import A2C
+  from stable_baselines3.a2c import MlpPolicy

-  env = gym.make('Pendulum-v0')
+  env = gym.make('CartPole-v1')

-  model = SAC(MlpPolicy, env, verbose=1)
+  model = A2C(MlpPolicy, env, verbose=1)
  model.learn(total_timesteps=10000)

  obs = env.reset()
  for i in range(1000):
-      action = model.predict(obs)
-      obs, rewards, dones, info = env.step(action)
+      action, _state = model.predict(obs, deterministic=True)
+      obs, reward, done, info = env.step(action)
      env.render()
+      if done:
+        obs = env.reset()


 Or just train a model with a one liner if
@ -34,6 +35,6 @@ the policy is registered:

 .. code-block:: python

-    from stable_baselines3 import SAC
+    from stable_baselines3 import A2C

-    model = SAC('MlpPolicy', 'Pendulum-v0').learn(10000)
+    model = A2C('MlpPolicy', 'CartPole-v1').learn(10000)
--- a/docs/guide/rl.rst
+++ b/docs/guide/rl.rst
@ -0,0 +1,17 @@
+.. _rl:
+
+================================
+Reinforcement Learning Resources
+================================
+
+
+Stable-Baselines3 assumes that you already understand the basic concepts of Reinforcement Learning (RL).
+
+However, if you want to learn about RL, there are several good resources to get started:
+
+- `OpenAI Spinning Up <https://spinningup.openai.com/en/latest/>`_
+- `David Silver's course <http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html>`_
+- `Lilian Weng's blog <https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html>`_
+- `Berkeley's Deep RL Bootcamp <https://sites.google.com/view/deep-rl-bootcamp/lectures>`_
+- `Berkeley's Deep Reinforcement Learning course <http://rail.eecs.berkeley.edu/deeprlcourse/>`_
+- `More resources <https://github.com/dennybritz/reinforcement-learning>`_
--- a/docs/guide/rl_tips.rst
+++ b/docs/guide/rl_tips.rst
@ -0,0 +1,251 @@
+.. _rl_tips:
+
+======================================
+Reinforcement Learning Tips and Tricks
+======================================
+
+The aim of this section is to help you do reinforcement learning experiments.
+It covers general advice about RL (where to start, which algorithm to choose, how to evaluate an algorithm, ...),
+as well as tips and tricks when using a custom environment or implementing an RL algorithm.
+
+
+General advice when using Reinforcement Learning
+================================================
+
+TL;DR
+-----
+
+1. Read about RL and Stable Baselines
+2. Do quantitative experiments and hyperparameter tuning if needed
+3. Evaluate the performance using a separate test environment
+4. For better performance, increase the training budget
+
+
+Like any other subject, if you want to work with RL, you should first read about it (we have a dedicated `resource page <rl.html>`_ to get you started)
+to understand what you are using. We also recommend you read the Stable Baselines (SB) documentation and do the `tutorial <https://github.com/araffin/rl-tutorial-jnrr19>`_.
+It covers basic usage and guides you towards more advanced concepts of the library (e.g. callbacks and wrappers).
+
+Reinforcement Learning differs from other machine learning methods in several ways. The data used to train the agent is collected
+through interactions with the environment by the agent itself (compared to supervised learning where you have a fixed dataset for instance).
+This dependence can lead to a vicious circle: if the agent collects poor quality data (e.g., trajectories with no rewards), then it will not improve and continue to amass
+bad trajectories.
+
+This factor, among others, explains why results in RL may vary from one run to another (i.e., when only the seed of the pseudo-random generator changes).
+For this reason, you should always do several runs to have quantitative results.
+
+Good results in RL are generally dependent on finding appropriate hyperparameters. Recent algorithms (PPO, SAC, TD3) normally require little hyperparameter tuning,
+however, *don't expect the default ones to work* on any environment.
+
+Therefore, we *highly recommend you* to take a look at the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_ (or the original papers) for tuned hyperparameters.
+A best practice when you apply RL to a new problem is to do automatic hyperparameter optimization. Again, this is included in the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_.
+
+When applying RL to a custom problem, you should always normalize the input to the agent (e.g. using VecNormalize for PPO/A2C)
+and look at common preprocessing done on other environments (e.g. for `Atari <https://danieltakeshi.github.io/2016/11/25/frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/>`_, frame-stack, ...).
+Please refer to *Tips and Tricks when creating a custom environment* paragraph below for more advice related to custom environments.
+
+
+Current Limitations of RL
+-------------------------
+
+You have to be aware of the current `limitations <https://www.alexirpan.com/2018/02/14/rl-hard.html>`_ of reinforcement learning.
+
+
+Model-free RL algorithms (i.e. all the algorithms implemented in SB) are usually *sample inefficient*. They require a lot of samples (sometimes millions of interactions) to learn something useful.
+That's why most of the successes in RL were achieved on games or in simulation only. For instance, in this `work <https://www.youtube.com/watch?v=aTDkYFZFWug>`_ by ETH Zurich, the ANYmal robot was trained in simulation only, and then tested in the real world.
+
+As general advice, to obtain better performance, you should increase the budget of the agent (number of training timesteps).
+
+
+In order to achieve the desired behavior, expert knowledge is often required to design an adequate reward function.
+This *reward engineering* (or *RewArt* as coined by `Freek Stulp <http://www.freekstulp.net/>`_), necessitates several iterations. As a good example of reward shaping,
+you can take a look at `Deep Mimic paper <https://xbpeng.github.io/projects/DeepMimic/index.html>`_ which combines imitation learning and reinforcement learning to do acrobatic moves.
+
+One last limitation of RL is the instability of training. That is to say, you can observe during training a huge drop in performance.
+This behavior is particularly present in ``DDPG``, that's why its extension ``TD3`` tries to tackle that issue.
+Other methods, like ``TRPO`` or ``PPO``, make use of a *trust region* to minimize that problem by avoiding too large updates.
+
+
+How to evaluate an RL algorithm?
+--------------------------------
+
+Because most algorithms use exploration noise during training, you need a separate test environment to evaluate the performance
+of your agent at a given time. It is recommended to periodically evaluate your agent for ``n`` test episodes (``n`` is usually between 5 and 20)
+and average the reward per episode to have a good estimate.
+
+As some policies are stochastic by default (e.g. A2C or PPO), you should also try to set ``deterministic=True`` when calling the ``.predict()`` method,
+this frequently leads to better performance.
+Looking at the training curve (episode reward as a function of the timesteps) is a good proxy but underestimates the agent's true performance.
+
+
+.. note::
+
+	We provide an ``EvalCallback`` for doing such evaluation. You can read more about it in the :ref:`Callbacks <callbacks>` section.
+
+
+
+We suggest reading `Deep Reinforcement Learning that Matters <https://arxiv.org/abs/1709.06560>`_ for a good discussion about RL evaluation.
+
+You can also take a look at this `blog post <https://openlab-flowers.inria.fr/t/how-many-random-seeds-should-i-use-statistical-power-analysis-in-deep-reinforcement-learning-experiments/457>`_
+and this `issue <https://github.com/hill-a/stable-baselines/issues/199>`_ by Cédric Colas.
+
+
+Which algorithm should I use?
+=============================
+
+There is no silver bullet in RL: depending on your needs and problem, you may choose one or the other.
+The first distinction comes from your action space, i.e., do you have discrete (e.g. LEFT, RIGHT, ...)
+or continuous actions (ex: go to a certain speed)?
+
+Some algorithms are only tailored for one or the other domain: ``DQN`` only supports discrete actions, whereas ``SAC`` is restricted to continuous actions.
+
+The second difference that will help you choose is whether you can parallelize your training or not.
+If what matters is the wall clock training time, then you should lean towards ``A2C`` and its derivatives (PPO, ...).
+Take a look at the `Vectorized Environments <vec_envs.html>`_ to learn more about training with multiple workers.
+
+To sum it up:
+
+Discrete Actions
+----------------
+
+.. note::
+
+	This covers ``Discrete``, ``MultiDiscrete``, ``Binary`` and ``MultiBinary`` spaces
+
+
+Discrete Actions - Single Process
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+DQN with extensions (double DQN, prioritized replay, ...) are the recommended algorithms.
+DQN is usually slower to train (regarding wall clock time) but is the most sample efficient (because of its replay buffer).
+
+Discrete Actions - Multiprocessed
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You should give a try to PPO or A2C.
+
+
+Continuous Actions
+------------------
+
+Continuous Actions - Single Process
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Current State Of The Art (SOTA) algorithms are ``SAC`` and ``TD3``.
+Please use the hyperparameters in the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_ for best results.
+
+
+Continuous Actions - Multiprocessed
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Take a look at PPO, TRPO or A2C. Again, don't forget to take the hyperparameters from the `RL zoo <https://github.com/DLR-RM/rl-baselines3-zoo>`_
+for continuous action problems (cf *Bullet* envs).
+
+.. note::
+
+  Normalization is critical for those algorithms
+
+
+
+.. Goal Environment
+.. -----------------
+..
+.. If your environment follows the ``GoalEnv`` interface (cf `HER <../modules/her.html>`_), then you should use
+.. HER + (SAC/TD3/DDPG/DQN) depending on the action space.
+..
+..
+.. .. note::
+..
+.. 	The number of workers is an important hyperparameters for experiments with HER
+..
+
+
+Tips and Tricks when creating a custom environment
+==================================================
+
+If you want to learn about how to create a custom environment, we recommend you read this `page <custom_env.html>`_.
+We also provide a `colab notebook <https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/5_custom_gym_env.ipynb>`_ for
+a concrete example of creating a custom gym environment.
+
+Some basic advice:
+
+- always normalize your observation space when you can, i.e., when you know the boundaries
+- normalize your action space and make it symmetric when continuous (cf potential issue below). A good practice is to rescale your actions to lie in [-1, 1]. This does not limit you as you can easily rescale the action inside the environment
+- start with shaped reward (i.e. informative reward) and simplified version of your problem
+- debug with random actions to check that your environment works and follows the gym interface:
+
+
+We provide a helper to check that your environment runs without error:
+
+.. code-block:: python
+
+	from stable_baselines3.common.env_checker import check_env
+
+	env = CustomEnv(arg1, ...)
+	# It will check your custom environment and output additional warnings if needed
+	check_env(env)
+
+
+If you want to quickly try a random agent on your environment, you can also do:
+
+.. code-block:: python
+
+	env = YourEnv()
+	obs = env.reset()
+	n_steps = 10
+	for _ in range(n_steps):
+	    # Random action
+	    action = env.action_space.sample()
+	    obs, reward, done, info = env.step(action)
+	    if done:
+	        obs = env.reset()
+
+
+**Why should I normalize the action space?**
+
+
+Most reinforcement learning algorithms rely on a Gaussian distribution (initially centered at 0 with std 1) for continuous actions.
+So, if you forget to normalize the action space when using a custom environment,
+this can harm learning and be difficult to debug (cf attached image and `issue #473 <https://github.com/hill-a/stable-baselines/issues/473>`_).
+
+.. figure:: ../_static/img/mistake.png
+
+
+Another consequence of using a Gaussian is that the action range is not bounded.
+That's why clipping is usually used as a bandage to stay in a valid interval.
+A better solution would be to use a squashing function (cf ``SAC``) or a Beta distribution (cf `issue #112 <https://github.com/hill-a/stable-baselines/issues/112>`_).
+
+.. note::
+
+	This statement is not true for ``DDPG`` or ``TD3`` because they don't rely on any probability distribution.
+
+
+
+Tips and Tricks when implementing an RL algorithm
+=================================================
+
+When you try to reproduce an RL paper by implementing the algorithm, the `nuts and bolts of RL research <http://joschu.net/docs/nuts-and-bolts.pdf>`_
+by John Schulman are quite useful (`video <https://www.youtube.com/watch?v=8EcdaCk9KaQ>`_).
+
+We *recommend following those steps to have a working RL algorithm*:
+
+1. Read the original paper several times
+2. Read existing implementations (if available)
+3. Try to have some "sign of life" on toy problems
+4. Validate the implementation by making it run on harder and harder envs (you can compare results against the RL zoo)
+	You usually need to run hyperparameter optimization for that step.
+
+You need to be particularly careful with the shape of the different objects you are manipulating (a broadcast mistake will fail silently, cf `issue #75 <https://github.com/hill-a/stable-baselines/pull/76>`_)
+and when to stop the gradient propagation.
+
+A personal pick (by @araffin) for environments with gradual difficulty in RL with continuous actions:
+
+1. Pendulum (easy to solve)
+2. HalfCheetahBullet (medium difficulty with local minima and shaped reward)
+3. BipedalWalkerHardcore (if it works on that one, then you can have a cookie)
+
+in RL with discrete actions:
+
+1. CartPole-v1 (easy to be better than random agent, harder to achieve maximal performance)
+2. LunarLander
+3. Pong (one of the easiest Atari games)
+4. other Atari games (e.g. Breakout)
--- a/docs/guide/vec_envs.rst
+++ b/docs/guide/vec_envs.rst
@ -6,11 +6,11 @@ Vectorized Environments
 =======================

 Vectorized Environments are a method for stacking multiple independent environments into a single environment.
-Instead of training an RL agent on 1 environment per step, it allows us to train it on `n` environments per step.
-Because of this, `actions` passed to the environment are now a vector (of dimension `n`).
-It is the same for `observations`, `rewards` and end of episode signals (`dones`).
-In the case of non-array observation spaces such as `Dict` or `Tuple`, where different sub-spaces
-may have different shapes, the sub-observations are vectors (of dimension `n`).
+Instead of training an RL agent on 1 environment per step, it allows us to train it on ``n`` environments per step.
+Because of this, ``actions`` passed to the environment are now a vector (of dimension ``n``).
+It is the same for ``observations``, ``rewards`` and end of episode signals (``dones``).
+In the case of non-array observation spaces such as ``Dict`` or ``Tuple``, where different sub-spaces
+may have different shapes, the sub-observations are vectors (of dimension ``n``).

 ============= ======= ============ ======== ========= ================
 Name          ``Box`` ``Discrete`` ``Dict`` ``Tuple`` Multi Processing
@ -27,7 +27,7 @@ SubprocVecEnv ✔️       ✔️           ✔️        ✔️         ✔️

 	When using vectorized environments, the environments are automatically reset at the end of each episode.
 	Thus, the observation returned for the i-th environment when ``done[i]`` is true will in fact be the first observation of the next episode, not the last observation of the episode that has just terminated.
-	You can access the "real" final observation of the terminated episode—that is, the one that accompanied the ``done`` event provided by the underlying environment—using the ``terminal_observation`` keys in the info dicts returned by the `VecEnv`.
+	You can access the "real" final observation of the terminated episode—that is, the one that accompanied the ``done`` event provided by the underlying environment—using the ``terminal_observation`` keys in the info dicts returned by the vecenv.

 .. warning::

@ -69,3 +69,24 @@ VecNormalize

 .. autoclass:: VecNormalize
  :members:
+
+
+VecVideoRecorder
+~~~~~~~~~~~~~~~~
+
+.. autoclass:: VecVideoRecorder
+  :members:
+
+
+VecCheckNan
+~~~~~~~~~~~~~~~~
+
+.. autoclass:: VecCheckNan
+  :members:
+
+
+VecTransposeImage
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: VecTransposeImage
+  :members:
--- a/docs/index.rst
+++ b/docs/index.rst
@ -1,4 +1,4 @@
-.. Stable Baselines documentation master file, created by
+.. Stable Baselines3 documentation master file, created by
   sphinx-quickstart on Thu Sep 26 11:06:54 2019.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.
@ -6,21 +6,41 @@
 Welcome to Stable Baselines3 docs!
 ==================================

-`Stable Baselines3 <https://github.com/DLR-RM/stable-baselines3>`_ is the next major version (PyTorch edition) of `Stable Baselines <https://github.com/hill-a/stable-baselines>`_,
-a set of improved implementations of reinforcement learning algorithms.
+`Stable Baselines3 <https://github.com/DLR-RM/stable-baselines3>`_ is a set of improved implementations of reinforcement learning algorithms in PyTorch.
+It is the next major version (PyTorch edition) of `Stable Baselines <https://github.com/hill-a/stable-baselines>`_.
+
+
+Github repository: https://github.com/DLR-RM/stable-baselines3

 RL Baselines3 Zoo (collection of pre-trained agents): https://github.com/DLR-RM/rl-baselines3-zoo

 RL Baselines3 Zoo also offers a simple interface to train, evaluate agents and do hyperparameter tuning.


+Main Features
+--------------
+
+- Unified structure for all algorithms
+- PEP8 compliant (unified code style)
+- Documented functions and classes
+- Tests, high code coverage and type hints
+- Clean code
+
+

 .. toctree::
   :maxdepth: 2
   :caption: User Guide

+   guide/install
   guide/quickstart
+   guide/rl_tips
+   guide/rl
   guide/vec_envs
+   guide/custom_env
+   guide/callbacks
+   guide/migration
+   guide/checking_nan


 .. toctree::
@ -33,12 +53,20 @@ RL Baselines3 Zoo also offers a simple interface to train, evaluate agents and d
  modules/sac
  modules/td3

+.. toctree::
+  :maxdepth: 1
+  :caption: Common
+
+  common/distributions
+  common/evaluation
+  common/env_checker

 .. toctree::
  :maxdepth: 1
  :caption: Misc

  misc/changelog
+  misc/projects


 Citing Stable Baselines3
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@ -29,6 +29,7 @@ Others:

 Documentation:
 ^^^^^^^^^^^^^^
+- Added most documentation (adapted from Stable-Baselines)


 Pre-Release 0.5.0 (2020-05-05)
--- a/docs/misc/projects.rst
+++ b/docs/misc/projects.rst
@ -0,0 +1,26 @@
+.. _projects:
+
+Projects
+=========
+
+This is a list of projects using stable-baselines3.
+Please tell us, if you want your project to appear on this page ;)
+
+
+.. RL Racing Robot
+.. --------------------------
+.. Implementation of reinforcement learning approach to make a donkey car learn to race.
+.. Uses SAC on autoencoder features
+..
+.. | Author: Antonin Raffin  (@araffin)
+.. | Github repo: https://github.com/araffin/RL-Racing-Robot
+
+
+.. Generalized State Dependent Exploration for Deep Reinforcement Learning in Robotics
+.. -----------------------------------------------------------------------------------
+..
+.. An exploration method to train RL agent directly on real robots.
+..
+.. | Author: Antonin Raffin, Freek Stulp
+.. | Github: https://github.com/DLR-RM/stable-baselines3/tree/sde
+.. | Paper:
--- a/docs/modules/a2c.rst
+++ b/docs/modules/a2c.rst
@ -38,15 +38,15 @@ MultiBinary   ❌      ❌
 Example
 -------

-Train a A2C agent on `CartPole-v1` using 4 processes.
+Train an A2C agent on ``CartPole-v1`` using 4 environments.

 .. code-block:: python

  import gym

-  from stable_baselines3.common.policies import MlpPolicy
-  from stable_baselines3.common import make_vec_env
  from stable_baselines3 import A2C
+  from stable_baselines3.a2c import MlpPolicy
+  from stable_baselines3.common.cmd_utils import make_vec_env

  # Parallel environments
  env = make_vec_env('CartPole-v1', n_envs=4)
--- a/docs/modules/base.rst
+++ b/docs/modules/base.rst
@ -10,3 +10,12 @@ Common interface for all the RL algorithms

 .. autoclass:: BaseRLModel
  :members:
+
+
+Base RL Class
+=============
+
+The base RL model for Off-Policy algorithms (ex: SAC/TD3)
+
+.. autoclass:: OffPolicyRLModel
+  :members:
--- a/docs/modules/ppo.rst
+++ b/docs/modules/ppo.rst
@ -47,34 +47,32 @@ MultiBinary   ❌      ❌
 Example
 -------

-Train a PPO agent on `Pendulum-v0` using 4 processes.
+Train a PPO agent on ``CartPole-v1`` using 4 environments.

 .. code-block:: python

-   import gym
+  import gym

-   from stable_baselines3.ppo.policies import MlpPolicy
-   from stable_baselines3.common.vec_env import SubprocVecEnv
-   from stable_baselines3 import PPO
+  from stable_baselines3 import PPO
+  from stable_baselines3.ppo import MlpPolicy
+  from stable_baselines3.common.cmd_utils import make_vec_env

-   # multiprocess environment
-   n_cpu = 4
-   env = SubprocVecEnv([lambda: gym.make('Pendulum-v0') for i in range(n_cpu)])
+  # Parallel environments
+  env = make_vec_env('CartPole-v1', n_envs=4)

-   model = PPO(MlpPolicy, env, verbose=1)
-   model.learn(total_timesteps=25000)
-   model.save("ppo2_cartpole")
+  model = PPO(MlpPolicy, env, verbose=1)
+  model.learn(total_timesteps=25000)
+  model.save("ppo_cartpole")

-   del model # remove to demonstrate saving and loading
+  del model # remove to demonstrate saving and loading

-   model = PPO.load("ppo2_cartpole")
+  model = PPO.load("ppo_cartpole")

-   # Enjoy trained agent
-   obs = env.reset()
-   while True:
-       action, _states = model.predict(obs)
-       obs, rewards, dones, info = env.step(action)
-       env.render()
+  obs = env.reset()
+  while True:
+      action, _states = model.predict(obs)
+      obs, rewards, dones, info = env.step(action)
+      env.render()

 Parameters
 ----------
--- a/docs/modules/sac.rst
+++ b/docs/modules/sac.rst
@ -14,7 +14,7 @@ A key feature of SAC, and a major difference with common RL algorithms, is that

 .. warning::

-  The SAC model does not support ``stable_baselines3.common.policies`` because it uses double q-values
+  The SAC model does not support ``stable_baselines3.ppo.policies`` because it uses double q-values
  and value estimation, as a result it must use its own policy models (see :ref:`sac_policies`).


@ -24,6 +24,7 @@ A key feature of SAC, and a major difference with common RL algorithms, is that
    :nosignatures:

    MlpPolicy
+    CnnPolicy


 Notes
@ -72,15 +73,13 @@ Example
  import gym
  import numpy as np

-  from stable_baselines3.sac.policies import MlpPolicy
-  from stable_baselines3.common.vec_env import DummyVecEnv
  from stable_baselines3 import SAC
+  from stable_baselines3.sac import MlpPolicy

  env = gym.make('Pendulum-v0')
-  env = DummyVecEnv([lambda: env])

  model = SAC(MlpPolicy, env, verbose=1)
-  model.learn(total_timesteps=50000, log_interval=10)
+  model.learn(total_timesteps=10000, log_interval=4)
  model.save("sac_pendulum")

  del model # remove to demonstrate saving and loading
@ -90,8 +89,10 @@ Example
  obs = env.reset()
  while True:
      action, _states = model.predict(obs)
-      obs, rewards, dones, info = env.step(action)
+      obs, reward, done, info = env.step(action)
      env.render()
+      if done:
+        obs = env.reset()

 Parameters
 ----------
@ -108,3 +109,7 @@ SAC Policies
 .. autoclass:: MlpPolicy
  :members:
  :inherited-members:
+
+.. .. autoclass:: CnnPolicy
+..   :members:
+..   :inherited-members:
--- a/docs/modules/td3.rst
+++ b/docs/modules/td3.rst
@ -14,7 +14,7 @@ We recommend reading `OpenAI Spinning guide on TD3 <https://spinningup.openai.co

 .. warning::

-  The TD3 model does not support ``stable_baselines3.common.policies`` because it uses double q-values
+  The TD3 model does not support ``stable_baselines3.ppo.policies`` because it uses double q-values
  estimation, as a result it must use its own policy models (see :ref:`td3_policies`).


@ -73,7 +73,7 @@ Example
  action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

  model = TD3(MlpPolicy, 'Pendulum-v0', action_noise=action_noise, verbose=1)
-  model.learn(total_timesteps=50000, log_interval=10)
+  model.learn(total_timesteps=10000, log_interval=10)
  model.save("td3_pendulum")
  env = model.get_env()

@ -87,6 +87,7 @@ Example
      obs, rewards, dones, info = env.step(action)
      env.render()

+
 Parameters
 ----------

@ -102,3 +103,8 @@ TD3 Policies
 .. autoclass:: MlpPolicy
  :members:
  :inherited-members:
+
+
+.. .. autoclass:: CnnPolicy
+..   :members:
+..   :inherited-members:
--- a/setup.py
+++ b/setup.py
@ -50,8 +50,10 @@ model.learn(total_timesteps=10000)
 obs = env.reset()
 for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
-    obs, rewards, dones, info = env.step(action)
+    obs, reward, done, info = env.step(action)
    env.render()
+    if done:
+        obs = env.reset()
 ```

 Or just train a model with a one liner if [the environment is registered in Gym](https://github.com/openai/gym/wiki/Environments) and if [the policy is registered](https://stable-baselines.readthedocs.io/en/master/guide/custom_policy.html):
--- a/stable_baselines3/a2c/init.py
+++ b/stable_baselines3/a2c/init.py
@ -1,2 +1,2 @@
 from stable_baselines3.a2c.a2c import A2C
-from stable_baselines3.ppo.policies import MlpPolicy
+from stable_baselines3.ppo.policies import MlpPolicy, CnnPolicy
--- a/stable_baselines3/ppo/init.py
+++ b/stable_baselines3/ppo/init.py
@ -1,2 +1,2 @@
 from stable_baselines3.ppo.ppo import PPO
-from stable_baselines3.ppo.policies import MlpPolicy
+from stable_baselines3.ppo.policies import MlpPolicy, CnnPolicy
--- a/stable_baselines3/sac/init.py
+++ b/stable_baselines3/sac/init.py
@ -1,2 +1,2 @@
 from stable_baselines3.sac.sac import SAC
-from stable_baselines3.sac.policies import MlpPolicy
+from stable_baselines3.sac.policies import MlpPolicy, CnnPolicy
--- a/stable_baselines3/td3/init.py
+++ b/stable_baselines3/td3/init.py
@ -1,2 +1,2 @@
 from stable_baselines3.td3.td3 import TD3
-from stable_baselines3.td3.policies import MlpPolicy
+from stable_baselines3.td3.policies import MlpPolicy, CnnPolicy