Update GAE computation docstring (#655)

* Fix typo in buffers.py

* Revert "Fix typo in buffers.py"

This reverts commit ca643d5e3a509ae1b8a65bf0de98f4609ca9d8da.

* Ignore pytype errors

* Update GAE computation docstring

Co-authored-by: Antonin Raffin <antonin.raffin@ensta.org>
This commit is contained in:
Shyamal H Anadkat 2021-11-25 04:53:42 -05:00 committed by GitHub
parent b37052cbf0
commit 3b68dc7312
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 7 additions and 5 deletions

View file

@ -34,6 +34,7 @@ Documentation:
- Add highway-env to projects page (@eleurent)
- Add tactile-gym to projects page (@ac-93)
- Fix indentation in the RL tips page (@cove9988)
- Update GAE computation docstring
Release 1.3.0 (2021-10-23)

View file

@ -16,6 +16,7 @@ filterwarnings =
[pytype]
inputs = stable_baselines3
disable = pyi-error
[flake8]
ignore = W503,W504,E203,E231 # line breaks before and after binary operators

View file

@ -352,9 +352,9 @@ class RolloutBuffer(BaseBuffer):
and GAE(lambda) advantage.
Uses Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)
to compute the advantage. To obtain vanilla advantage (A(s) = R - V(S))
where R is the discounted reward with value bootstrap,
set ``gae_lambda=1.0`` during initialization.
to compute the advantage. To obtain Monte-Carlo advantage estimate (A(s) = R - V(S))
where R is the sum of discounted reward with value bootstrap
(because we don't always have full episode), set ``gae_lambda=1.0`` during initialization.
The TD(lambda) estimator has also two special cases:
- TD(1) is Monte-Carlo estimate (sum of discounted rewards)
@ -364,7 +364,6 @@ class RolloutBuffer(BaseBuffer):
:param last_values: state value estimation for the last step (one for each env)
:param dones: if the last step was a terminal step (one bool for each env).
"""
# Convert to numpy
last_values = last_values.clone().cpu().numpy().flatten()
@ -623,7 +622,7 @@ class DictRolloutBuffer(RolloutBuffer):
:param action_space: Action space
:param device:
:param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator
Equivalent to classic advantage when set to 1.
Equivalent to Monte-Carlo advantage estimate when set to 1.
:param gamma: Discount factor
:param n_envs: Number of parallel environments
"""

View file

@ -18,6 +18,7 @@ class StackedObservations(object):
:param num_envs: number of environments
:param n_stack: Number of frames to stack
:param observation_space: Environment observation space.
:param channels_order: If "first", stack on first image dimension. If "last", stack on last dimension.
If None, automatically detect channel to stack over in case of image observation or default to "last" (default).
"""