Re-sample noise matrix for PPO

2026-07-12 17:58:00 +00:00 · 2019-12-20 11:28:20 +01:00 · 2019-12-20 11:28:20 +01:00 · 161c608f9c
commit 161c608f9c
parent e894f1f11b
1 changed file with 7 additions and 1 deletion
--- a/torchy_baselines/ppo/ppo.py
+++ b/torchy_baselines/ppo/ppo.py
@@ -56,7 +56,7 @@ class PPO(BaseRLModel):
    :param use_sde: (bool) Whether to use State Dependent Exploration (SDE)
        instead of action noise exploration (default: False)
    :param sde_sample_freq: (int) Sample a new noise matrix every n steps when using SDE
-        Default: -1 (only sample at the beginning of the rollout)    
+        Default: -1 (only sample at the beginning of the rollout)
    :param target_kl: (float) Limit the KL divergence between updates,
        because the clipping is not enough to prevent large update
        see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
@@ -210,6 +210,12 @@ class PPO(BaseRLModel):
                    # Convert discrete action for float to long
                    action = action.long().flatten()

+                # Re-sample the noise matrix because the log_std has changed
+                # TODO: investigate why there is no issue with the gradient
+                # if that line is commented (as in SAC)
+                if self.use_sde:
+                    self.policy.reset_noise(batch_size)
+
                values, log_prob, entropy = self.policy.evaluate_actions(obs, action)
                values = values.flatten()
                # Normalize advantage