mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-07-03 03:59:13 +00:00
Re-sample noise matrix for PPO
This commit is contained in:
parent
e894f1f11b
commit
161c608f9c
1 changed files with 7 additions and 1 deletions
|
|
@ -56,7 +56,7 @@ class PPO(BaseRLModel):
|
|||
:param use_sde: (bool) Whether to use State Dependent Exploration (SDE)
|
||||
instead of action noise exploration (default: False)
|
||||
:param sde_sample_freq: (int) Sample a new noise matrix every n steps when using SDE
|
||||
Default: -1 (only sample at the beginning of the rollout)
|
||||
Default: -1 (only sample at the beginning of the rollout)
|
||||
:param target_kl: (float) Limit the KL divergence between updates,
|
||||
because the clipping is not enough to prevent large updates
|
||||
see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
|
||||
|
|
@ -210,6 +210,12 @@ class PPO(BaseRLModel):
|
|||
# Convert discrete action from float to long
|
||||
action = action.long().flatten()
|
||||
|
||||
# Re-sample the noise matrix because the log_std has changed
|
||||
# TODO: investigate why there is no issue with the gradient
|
||||
# if that line is commented out (as in SAC)
|
||||
if self.use_sde:
|
||||
self.policy.reset_noise(batch_size)
|
||||
|
||||
values, log_prob, entropy = self.policy.evaluate_actions(obs, action)
|
||||
values = values.flatten()
|
||||
# Normalize advantage
|
||||
|
|
|
|||
Loading…
Reference in a new issue