From 494ebfd20abe90acc136fdaf215c76ec566acd2c Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN <antonin.raffin@ensta.org>
Date: Wed, 10 Jun 2020 18:58:35 +0200
Subject: [PATCH] Hotfix PPO + gSDE (#53)

* Fix variable being passed with gradients

* Update changelog

* Bump version

* Fixes #54
---
 docs/misc/changelog.rst                   | 17 ++++++++++-------
 stable_baselines3/common/distributions.py |  2 +-
 stable_baselines3/version.txt             |  2 +-
 tests/test_vec_normalize.py               | 10 ++++++----
 4 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index c0e19c4..4f3db8a 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -4,22 +4,24 @@ Changelog
 ==========
 
 
-Pre-Release 0.7.0a1 (WIP)
+Pre-Release 0.7.0 (2020-06-10)
 ------------------------------
 
+**Hotfix for PPO/A2C + gSDE, internal refactoring and bug fixes**
+
 Breaking Changes:
 ^^^^^^^^^^^^^^^^^
 - ``render()`` method of ``VecEnvs`` now only accept one argument: ``mode``
 - Created new file common/torch_layers.py, similar to SB refactoring
-  
+
   - Contains all PyTorch network layer definitions and feature extractors: ``MlpExtractor``, ``create_mlp``, ``NatureCNN``
 
 - Renamed ``BaseRLModel`` to ``BaseAlgorithm`` (along with offpolicy and onpolicy variants)
 - Moved on-policy and off-policy base algorithms to ``common/on_policy_algorithm.py`` and ``common/off_policy_algorithm.py``, respectively.
-- Moved ``PPOPolicy`` to ``ActorCriticPolicy`` in common/policies.py    
+- Moved ``PPOPolicy`` to ``ActorCriticPolicy`` in common/policies.py
 - Moved ``PPO`` (algorithm class) into ``OnPolicyAlgorithm`` (``common/on_policy_algorithm.py``), to be shared with A2C
-- Moved following functions from ``BaseAlgorithm``: 
-  
+- Moved the following functions from ``BaseAlgorithm``:
+
   - ``_load_from_file`` to ``load_from_zip_file`` (save_util.py)
   - ``_save_to_file_zip`` to ``save_to_zip_file`` (save_util.py)
   - ``safe_mean`` to ``safe_mean`` (utils.py)
@@ -28,7 +30,7 @@ Breaking Changes:
 - Moved static function ``_is_vectorized_observation`` from common/policies.py to common/utils.py under name ``is_vectorized_observation``.
 - Removed ``{save,load}_running_average`` functions of ``VecNormalize`` in favor of ``load/save``.
 - Removed ``use_gae`` parameter from ``RolloutBuffer.compute_returns_and_advantage``.
- 
+
 New Features:
 ^^^^^^^^^^^^^
 
@@ -38,6 +40,7 @@ Bug Fixes:
 - Fixed ``seed()`` method for ``SubprocVecEnv``
 - Fixed loading on GPU for testing when using gSDE and ``deterministic=False``
 - Fixed ``register_policy`` to allow re-registering same policy for same sub-class (i.e. assign same value to same key).
+- Fixed a bug where the gradient was not stopped at the latent features in the variance computation when using ``gSDE`` with ``PPO``/``A2C``; this does not affect ``SAC``
 
 Deprecations:
 ^^^^^^^^^^^^^
@@ -67,7 +70,7 @@ Breaking Changes:
 ^^^^^^^^^^^^^^^^^
 - Remove State-Dependent Exploration (SDE) support for ``TD3``
 - Methods were renamed in the logger:
-  
+
   - ``logkv`` -> ``record``, ``writekvs`` -> ``write``, ``writeseq`` ->  ``write_sequence``,
   - ``logkvs`` -> ``record_dict``, ``dumpkvs`` -> ``dump``,
   - ``getkvs`` -> ``get_log_dict``, ``logkv_mean`` -> ``record_mean``,
diff --git a/stable_baselines3/common/distributions.py b/stable_baselines3/common/distributions.py
index f9bb16c..951f163 100644
--- a/stable_baselines3/common/distributions.py
+++ b/stable_baselines3/common/distributions.py
@@ -525,7 +525,7 @@ class StateDependentNoiseDistribution(Distribution):
         """
         # Stop gradient if we don't want to influence the features
         self._latent_sde = latent_sde if self.learn_features else latent_sde.detach()
-        variance = th.mm(latent_sde ** 2, self.get_std(log_std) ** 2)
+        variance = th.mm(self._latent_sde ** 2, self.get_std(log_std) ** 2)
         self.distribution = Normal(mean_actions, th.sqrt(variance + self.epsilon))
         return self
 
diff --git a/stable_baselines3/version.txt b/stable_baselines3/version.txt
index cde2c3f..faef31a 100644
--- a/stable_baselines3/version.txt
+++ b/stable_baselines3/version.txt
@@ -1 +1 @@
-0.7.0a1
+0.7.0
diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py
index 0765967..124c5e5 100644
--- a/tests/test_vec_normalize.py
+++ b/tests/test_vec_normalize.py
@@ -1,5 +1,3 @@
-import os
-
 import gym
 import pytest
 import numpy as np
@@ -138,7 +136,7 @@ def test_sync_vec_normalize():
 
     assert unwrap_vec_normalize(env) is None
 
-    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
+    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100., clip_reward=100.)
 
     assert isinstance(unwrap_vec_normalize(env), VecNormalize)
 
@@ -147,9 +145,13 @@ def test_sync_vec_normalize():
     assert isinstance(unwrap_vec_normalize(env), VecNormalize)
 
     eval_env = DummyVecEnv([make_env])
-    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
+    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True,
+                            clip_obs=100., clip_reward=100.)
     eval_env = VecFrameStack(eval_env, 1)
 
+    env.seed(0)
+    env.action_space.seed(0)
+
     env.reset()
     # Initialize running mean
     latest_reward = None