mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-05-14 20:58:03 +00:00
Update GAE computation docstring (#655)
* Fix typo in buffers.py * Revert "Fix typo in buffers.py" This reverts commit ca643d5e3a509ae1b8a65bf0de98f4609ca9d8da. * Ignore pytype errors * Update GAE computation docstring Co-authored-by: Antonin Raffin <antonin.raffin@ensta.org>
This commit is contained in:
parent
b37052cbf0
commit
3b68dc7312
4 changed files with 7 additions and 5 deletions
|
|
@ -34,6 +34,7 @@ Documentation:
|
|||
- Add highway-env to projects page (@eleurent)
|
||||
- Add tactile-gym to projects page (@ac-93)
|
||||
- Fix indentation in the RL tips page (@cove9988)
|
||||
- Update GAE computation docstring
|
||||
|
||||
|
||||
Release 1.3.0 (2021-10-23)
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ filterwarnings =
|
|||
|
||||
[pytype]
|
||||
inputs = stable_baselines3
|
||||
disable = pyi-error
|
||||
|
||||
[flake8]
|
||||
ignore = W503,W504,E203,E231 # line breaks before and after binary operators
|
||||
|
|
|
|||
|
|
@ -352,9 +352,9 @@ class RolloutBuffer(BaseBuffer):
|
|||
and GAE(lambda) advantage.
|
||||
|
||||
Uses Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)
|
||||
to compute the advantage. To obtain vanilla advantage (A(s) = R - V(S))
|
||||
where R is the discounted reward with value bootstrap,
|
||||
set ``gae_lambda=1.0`` during initialization.
|
||||
to compute the advantage. To obtain Monte-Carlo advantage estimate (A(s) = R - V(S))
|
||||
where R is the sum of discounted reward with value bootstrap
|
||||
(because we don't always have full episode), set ``gae_lambda=1.0`` during initialization.
|
||||
|
||||
The TD(lambda) estimator has also two special cases:
|
||||
- TD(1) is Monte-Carlo estimate (sum of discounted rewards)
|
||||
|
|
@ -364,7 +364,6 @@ class RolloutBuffer(BaseBuffer):
|
|||
|
||||
:param last_values: state value estimation for the last step (one for each env)
|
||||
:param dones: if the last step was a terminal step (one bool for each env).
|
||||
|
||||
"""
|
||||
# Convert to numpy
|
||||
last_values = last_values.clone().cpu().numpy().flatten()
|
||||
|
|
@ -623,7 +622,7 @@ class DictRolloutBuffer(RolloutBuffer):
|
|||
:param action_space: Action space
|
||||
:param device:
|
||||
:param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator
|
||||
Equivalent to classic advantage when set to 1.
|
||||
Equivalent to Monte-Carlo advantage estimate when set to 1.
|
||||
:param gamma: Discount factor
|
||||
:param n_envs: Number of parallel environments
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ class StackedObservations(object):
|
|||
|
||||
:param num_envs: number of environments
|
||||
:param n_stack: Number of frames to stack
|
||||
:param observation_space: Environment observation space.
|
||||
:param channels_order: If "first", stack on first image dimension. If "last", stack on last dimension.
|
||||
If None, automatically detect channel to stack over in case of image observation or default to "last" (default).
|
||||
"""
|
||||
|
|
|
|||
Loading…
Reference in a new issue