From c15b4bda1ef8e4dd821b758ef7532165ab82c487 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Mon, 28 Oct 2019 18:24:13 +0100
Subject: [PATCH 01/19] Add first draft of SDE

---
 tests/test_sde.py                        | 47 +++++++++++++++++
 torchy_baselines/a2c/a2c.py              |  9 +++-
 torchy_baselines/common/distributions.py | 67 +++++++++++++++++++++++-
 torchy_baselines/ppo/policies.py         | 37 +++++++++----
 torchy_baselines/ppo/ppo.py              | 12 ++++-
 5 files changed, 156 insertions(+), 16 deletions(-)
 create mode 100644 tests/test_sde.py

diff --git a/tests/test_sde.py b/tests/test_sde.py
new file mode 100644
index 0000000..7874ae9
--- /dev/null
+++ b/tests/test_sde.py
@@ -0,0 +1,47 @@
+import pytest
+
+import torch as th
+from torch.distributions import Normal
+
+from torchy_baselines import A2C
+
+
+def test_state_dependent_exploration():
+    state_dim = 3
+    # TODO: fix for action_dim > 1
+    action_dim = 1
+    sigma = th.ones(state_dim, action_dim, requires_grad=True)
+
+    # log_sigma = th.ones(2, 1, requires_grad=True)
+
+    # weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma))
+    th.manual_seed(2)
+    weights_dist = Normal(th.zeros_like(sigma), sigma)
+
+    weights = weights_dist.rsample()
+    state = th.rand(1, state_dim)
+    # state = (th.ones(state_dim,) * 2).view(1, -1)
+    mu = th.ones(action_dim)
+    # print(weights.shape, state.shape)
+    noise = th.mm(state, weights)
+    # variance = th.mm(state ** 2, th.exp(log_sigma) ** 2)
+    variance = th.mm(state ** 2, sigma ** 2)
+    action_dist = Normal(mu, th.sqrt(variance))
+
+    loss = action_dist.log_prob((mu + noise).detach()).mean()
+    loss.backward()
+
+    # From Rueckstiess paper
+    grad = th.zeros_like(sigma)
+    for j in range(action_dim):
+        for i in range(state_dim):
+            grad[i, j] = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j])
+
+    # sigma.grad should be equal to grad
+    assert sigma.grad.allclose(grad)
+
+
+@pytest.mark.parametrize("model_class", [A2C])
+def test_state_dependent_noise(model_class):
+    model = model_class('MlpPolicy', 'Pendulum-v0', n_steps=200, use_sde=True, verbose=1, create_eval_env=True)
+    model.learn(total_timesteps=int(1e6), log_interval=10, eval_freq=10000)
diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py
index 6ee6f4a..3aa1589 100644
--- a/torchy_baselines/a2c/a2c.py
+++ b/torchy_baselines/a2c/a2c.py
@@ -5,6 +5,7 @@ import torch.nn.functional as F
 from torchy_baselines.common.utils import explained_variance
 from torchy_baselines.ppo.ppo import PPO
 from torchy_baselines.ppo.policies import PPOPolicy
+from torchy_baselines.common import logger
 
 
 class A2C(PPO):
@@ -30,6 +31,8 @@ class A2C(PPO):
     :param rms_prop_eps: (float) RMSProp epsilon. It stabilizes square root computation in denominator
         of RMSProp update
     :param use_rms_prop: (bool) Whether to use RMSprop (default) or Adam as optimizer
+    :param use_sde: (bool) Whether to use State Dependent Exploration (SDE)
+        instead of action noise exploration (default: False)
     :param normalize_advantage: (bool) Whether to normalize or not the advantage
     :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
     :param create_eval_env: (bool) Whether to create a second environment that will be
@@ -45,7 +48,7 @@ class A2C(PPO):
     def __init__(self, policy, env, learning_rate=7e-4,
                  n_steps=5, gamma=0.99, gae_lambda=1.0,
                  ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5,
-                 rms_prop_eps=1e-5, use_rms_prop=True,
+                 rms_prop_eps=1e-5, use_rms_prop=True, use_sde=False,
                  normalize_advantage=False, tensorboard_log=None, create_eval_env=False,
                  policy_kwargs=None, verbose=0, seed=0, device='auto',
                  _init_setup_model=True):
@@ -53,7 +56,7 @@ class A2C(PPO):
         super(A2C, self).__init__(policy, env, learning_rate=learning_rate,
                                   n_steps=n_steps, batch_size=None, n_epochs=1,
                                   gamma=gamma, gae_lambda=gae_lambda, ent_coef=ent_coef,
-                                  vf_coef=vf_coef, max_grad_norm=max_grad_norm,
+                                  vf_coef=vf_coef, max_grad_norm=max_grad_norm, use_sde=use_sde,
                                   tensorboard_log=tensorboard_log, policy_kwargs=policy_kwargs,
                                   verbose=verbose, device=device, create_eval_env=create_eval_env,
                                   seed=seed, _init_setup_model=False)
@@ -73,6 +76,8 @@ class A2C(PPO):
                                                      eps=self.rms_prop_eps, weight_decay=0)
 
     def train(self, gradient_steps, batch_size=None):
+        if self.use_sde:
+            logger.logkv("noise net std", th.exp(self.policy.log_std).mean().item())
 
         # Update optimizer learning rate
         self._update_learning_rate(self.policy.optimizer)
diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py
index a384ce3..d420e03 100644
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -165,15 +165,80 @@ class CategoricalDistribution(Distribution):
         return log_prob
 
 
-def make_proba_distribution(action_space):
+class StateDependentNoiseDistribution(Distribution):
+    def __init__(self, features_dim, action_dim):
+        super(StateDependentNoiseDistribution, self).__init__()
+        self.distribution = None
+        self.action_dim = action_dim
+        self.features_dim = features_dim
+        self.mean_actions = None
+        self.log_std = None
+        self.weights_dist = None
+        self.noise_weights = None
+
+    @staticmethod
+    def get_std(log_std):
+        # TODO: use expln instead of exp only to avoid sigma growing too fast
+        return th.exp(log_std)
+
+    def sample_weights(self, log_std):
+        self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std))
+        self.noise_weights = self.weights_dist.rsample()
+
+    def proba_distribution_net(self, latent_dim, log_std_init=0.0):
+        mean_actions = nn.Linear(latent_dim, self.action_dim)
+        log_std = nn.Parameter(th.zeros(self.features_dim, self.action_dim))
+        self.sample_weights(log_std)
+        return mean_actions, log_std
+
+    def proba_distribution(self, mean_actions, log_std, observations, deterministic=False):
+        variance = th.mm(observations ** 2, self.get_std(log_std) ** 2)
+        self.distribution = Normal(mean_actions, th.sqrt(variance))
+
+        if deterministic:
+            action = self.mode()
+        else:
+            action = self.sample(observations)
+        return action, self
+
+    def mode(self):
+        return self.distribution.mean
+
+    def sample(self, observations):
+        noise = th.mm(observations, self.noise_weights)
+        return self.distribution.mean + noise
+
+    def entropy(self):
+        return self.distribution.entropy()
+
+    def log_prob_from_params(self, mean_actions, log_std, observations):
+        action, _ = self.proba_distribution(mean_actions, log_std, observations)
+        log_prob = self.log_prob(action)
+        return action, log_prob
+
+    def log_prob(self, action):
+        log_prob = self.distribution.log_prob(action)
+        if len(log_prob.shape) > 1:
+            log_prob = log_prob.sum(axis=1)
+        else:
+            log_prob = log_prob.sum()
+        return log_prob
+
+
+def make_proba_distribution(action_space, features_dim=None, use_sde=False):
     """
     Return an instance of Distribution for the correct type of action space
 
     :param action_space: (Gym Space) the input action space
+    :param features_dim: (int) Dimension of the feature vector
+    :param use_sde: (bool) Force the use of StateDependentNoiseDistribution
+        instead of DiagGaussianDistribution
     :return: (Distribution) the approriate Distribution object
     """
     if isinstance(action_space, spaces.Box):
         assert len(action_space.shape) == 1, "Error: the action space must be a vector"
+        if use_sde:
+            return StateDependentNoiseDistribution(features_dim, action_space.shape[0])
         return DiagGaussianDistribution(action_space.shape[0])
     elif isinstance(action_space, spaces.Discrete):
         return CategoricalDistribution(action_space.n)
diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py
index e973858..08dd7dd 100644
--- a/torchy_baselines/ppo/policies.py
+++ b/torchy_baselines/ppo/policies.py
@@ -6,7 +6,8 @@ import torch.nn as nn
 import numpy as np
 
 from torchy_baselines.common.policies import BasePolicy, register_policy, create_mlp
-from torchy_baselines.common.distributions import make_proba_distribution, DiagGaussianDistribution, CategoricalDistribution
+from torchy_baselines.common.distributions import make_proba_distribution,\
+    DiagGaussianDistribution, CategoricalDistribution, StateDependentNoiseDistribution
 
 
 class MlpExtractor(nn.Module):
@@ -101,7 +102,8 @@ class MlpExtractor(nn.Module):
 class PPOPolicy(BasePolicy):
     def __init__(self, observation_space, action_space,
                  learning_rate, net_arch=None, device='cpu',
-                 activation_fn=nn.Tanh, adam_epsilon=1e-5, ortho_init=True):
+                 activation_fn=nn.Tanh, adam_epsilon=1e-5,
+                 ortho_init=True, use_sde=False):
         super(PPOPolicy, self).__init__(observation_space, action_space, device)
         self.obs_dim = self.observation_space.shape[0]
         if net_arch is None:
@@ -118,20 +120,31 @@ class PPOPolicy(BasePolicy):
         }
         self.shared_net = None
         self.pi_net, self.vf_net = None, None
-        # Action distribution
-        self.action_dist = make_proba_distribution(action_space)
         # In the future, feature_extractor will be replaced with a CNN
         self.features_extractor = nn.Flatten()
         self.features_dim = self.obs_dim
+        # Action distribution
+        self.action_dist = make_proba_distribution(action_space, self.features_dim, use_sde=use_sde)
+
         self._build(learning_rate)
 
+    def reset_noise_net(self):
+        self.action_dist.sample_weights(self.log_std)
+        # weights_dist = Normal(th.zeros_like(self.noise_log_sigma), th.exp(self.noise_log_sigma))
+        # self.noise_net = weights_dist.rsample()
+        # noise = th.mm(state, weights)
+        # variance = th.mm(state ** 2, sigma ** 2)
+        # action_dist = Normal(mu, th.sqrt(variance))
+        # # action_dist.log_prob((mu + noise).detach())
+        # action_dist.log_prob(action)
+        # # action_dist = Normal(mu_j + noise_j, sum of s_i * sigma_ij)
+        # # log_prob = distribution.log_prob(self.noise_net)
+
     def _build(self, learning_rate):
         self.mlp_extractor = MlpExtractor(self.features_dim, net_arch=self.net_arch,
                                           activation_fn=self.activation_fn, device=self.device)
 
-        # self.action_net = nn.Linear(self.net_arch[-1], self.action_dim)
-        # self.log_std = nn.Parameter(th.zeros(self.action_dim))
-        if isinstance(self.action_dist, DiagGaussianDistribution):
+        if isinstance(self.action_dist, (DiagGaussianDistribution, StateDependentNoiseDistribution)):
             self.action_net, self.log_std = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi)
         elif isinstance(self.action_dist, CategoricalDistribution):
             self.action_net = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi)
@@ -155,28 +168,30 @@ class PPOPolicy(BasePolicy):
             obs = th.FloatTensor(obs).to(self.device)
         latent_pi, latent_vf = self._get_latent(obs)
         value = self.value_net(latent_vf)
-        action, action_distribution = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic)
+        action, action_distribution = self._get_action_dist_from_latent(latent_pi, obs, deterministic=deterministic)
         log_prob = action_distribution.log_prob(action)
         return action, value, log_prob
 
     def _get_latent(self, obs):
         return self.mlp_extractor(self.features_extractor(obs))
 
-    def _get_action_dist_from_latent(self, latent, deterministic=False):
+    def _get_action_dist_from_latent(self, latent, obs, deterministic=False):
         mean_actions = self.action_net(latent)
         if isinstance(self.action_dist, DiagGaussianDistribution):
             return self.action_dist.proba_distribution(mean_actions, self.log_std, deterministic=deterministic)
         elif isinstance(self.action_dist, CategoricalDistribution):
             return self.action_dist.proba_distribution(mean_actions, deterministic=deterministic)
+        elif isinstance(self.action_dist, StateDependentNoiseDistribution):
+            return self.action_dist.proba_distribution(mean_actions, self.log_std, obs, deterministic=deterministic)
 
     def actor_forward(self, obs, deterministic=False):
         latent_pi, _ = self._get_latent(obs)
-        action, _ = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic)
+        action, _ = self._get_action_dist_from_latent(latent_pi, obs, deterministic=deterministic)
         return action.detach().cpu().numpy()
 
     def get_policy_stats(self, obs, action):
         latent_pi, latent_vf = self._get_latent(obs)
-        _, action_distribution = self._get_action_dist_from_latent(latent_pi)
+        _, action_distribution = self._get_action_dist_from_latent(latent_pi, obs)
         log_prob = action_distribution.log_prob(action)
         value = self.value_net(latent_vf)
         return value, log_prob, action_distribution.entropy()
diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py
index 13a1634..7fa318d 100644
--- a/torchy_baselines/ppo/ppo.py
+++ b/torchy_baselines/ppo/ppo.py
@@ -52,6 +52,8 @@ class PPO(BaseRLModel):
     :param ent_coef: (float) Entropy coefficient for the loss calculation
     :param vf_coef: (float) Value function coefficient for the loss calculation
     :param max_grad_norm: (float) The maximum value for the gradient clipping
+    :param use_sde: (bool) Whether to use State Dependent Exploration (SDE)
+        instead of action noise exploration (default: False)
     :param target_kl: (float) Limit the KL divergence between updates,
         because the clipping is not enough to prevent large update
         see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
@@ -70,7 +72,7 @@ class PPO(BaseRLModel):
     def __init__(self, policy, env, learning_rate=3e-4,
                  n_steps=2048, batch_size=64, n_epochs=10,
                  gamma=0.99, gae_lambda=0.95, clip_range=0.2, clip_range_vf=None,
-                 ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5,
+                 ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, use_sde=False,
                  target_kl=None, tensorboard_log=None, create_eval_env=False,
                  policy_kwargs=None, verbose=0, seed=0, device='auto',
                  _init_setup_model=True):
@@ -94,6 +96,7 @@ class PPO(BaseRLModel):
         self.target_kl = target_kl
         self.tensorboard_log = tensorboard_log
         self.tb_writer = None
+        self.use_sde = use_sde
 
         if _init_setup_model:
             self._setup_model()
@@ -116,7 +119,8 @@ class PPO(BaseRLModel):
         self.rollout_buffer = RolloutBuffer(self.n_steps, state_dim, action_dim, self.device,
                                             gamma=self.gamma, gae_lambda=self.gae_lambda, n_envs=self.n_envs)
         self.policy = self.policy(self.observation_space, self.action_space,
-                                  self.learning_rate, device=self.device, **self.policy_kwargs)
+                                  self.learning_rate, use_sde=self.use_sde, device=self.device,
+                                  **self.policy_kwargs)
         self.policy = self.policy.to(self.device)
 
         self.clip_range = get_schedule_fn(self.clip_range)
@@ -150,6 +154,10 @@ class PPO(BaseRLModel):
 
         n_steps = 0
         rollout_buffer.reset()
+        # Sample new weights for the state dependent exploration
+        # TODO: ensure episodic setting?
+        if self.use_sde:
+            self.policy.reset_noise_net()
 
         while n_steps < n_rollout_steps:
             with th.no_grad():

From 69a348276edfda1eabbde6e9b8ec29ea1a2f71bf Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Tue, 29 Oct 2019 12:36:40 +0100
Subject: [PATCH 02/19] Add classic advantage computation

---
 torchy_baselines/common/buffers.py | 48 +++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/torchy_baselines/common/buffers.py b/torchy_baselines/common/buffers.py
index 34a7098..7ca61b8 100644
--- a/torchy_baselines/common/buffers.py
+++ b/torchy_baselines/common/buffers.py
@@ -113,22 +113,42 @@ class RolloutBuffer(BaseBuffer):
         self.generator_ready = False
         super(RolloutBuffer, self).reset()
 
-    def compute_returns_and_advantage(self, last_value, dones=False):
+    def compute_returns_and_advantage(self, last_value, dones=False, use_gae=True):
         """
-        From PPO2
+        From Stable-Baselines PPO2
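+        :param last_value: (th.Tensor) Value estimate bootstrapping the last buffer step
+        :param dones: ([bool]) Done flags after the last step (they mask the bootstrap value)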
+        :param use_gae: (bool) Whether to use Generalized Advantage Estimation
+            or normal advantage for advantage computation.
         """
-        last_gae_lam = 0
-        for step in reversed(range(self.buffer_size)):
-            if step == self.buffer_size - 1:
-                next_non_terminal = th.FloatTensor(1.0 - dones)
-                next_value = last_value.clone().cpu().flatten()
-            else:
-                next_non_terminal = 1.0 - self.dones[step + 1]
-                next_value = self.values[step + 1]
-            delta = self.rewards[step] + self.gamma * next_value * next_non_terminal - self.values[step]
-            last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam
-            self.advantages[step] = last_gae_lam
-        self.returns = self.advantages + self.values
+        if use_gae:
+            last_gae_lam = 0
+            for step in reversed(range(self.buffer_size)):
+                if step == self.buffer_size - 1:
+                    next_non_terminal = th.FloatTensor(1.0 - dones)
+                    next_value = last_value.clone().cpu().flatten()
+                else:
+                    next_non_terminal = 1.0 - self.dones[step + 1]
+                    next_value = self.values[step + 1]
+                delta = self.rewards[step] + self.gamma * next_value * next_non_terminal - self.values[step]
+                last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam
+                self.advantages[step] = last_gae_lam
+            self.returns = self.advantages + self.values
+        else:
+            # Discounted return with value bootstrap
+            # NOTE(review): meant to match GAE with gae_lambda = 1.0, but the
+            # bootstrap value below is not multiplied by gamma as in GAE
+            last_return = 0.0
+            for step in reversed(range(self.buffer_size)):
+                if step == self.buffer_size - 1:
+                    next_non_terminal = th.FloatTensor(1.0 - dones)
+                    next_value = last_value.clone().cpu().flatten()
+                    last_return = self.rewards[step] + next_non_terminal * next_value
+                else:
+                    next_non_terminal = 1.0 - self.dones[step + 1]
+                    last_return = self.rewards[step] + self.gamma * last_return * next_non_terminal
+                self.returns[step] = last_return
+            self.advantages = self.returns - self.values
 
     def add(self, obs, action, reward, done, value, log_prob):
         if len(log_prob.shape) == 0:

From 0d41bc13560ac8ed010d8bcc6407d3318a494646 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Tue, 29 Oct 2019 15:15:11 +0100
Subject: [PATCH 03/19] Add more logging

---
 torchy_baselines/a2c/a2c.py | 18 ++++++++++++------
 torchy_baselines/ppo/ppo.py | 10 ++++++++--
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py
index 3aa1589..6b51227 100644
--- a/torchy_baselines/a2c/a2c.py
+++ b/torchy_baselines/a2c/a2c.py
@@ -76,9 +76,6 @@ class A2C(PPO):
                                                      eps=self.rms_prop_eps, weight_decay=0)
 
     def train(self, gradient_steps, batch_size=None):
-        if self.use_sde:
-            logger.logkv("noise net std", th.exp(self.policy.log_std).mean().item())
-
         # Update optimizer learning rate
         self._update_learning_rate(self.policy.optimizer)
         # A2C with gradient_steps > 1 does not make sense
@@ -118,10 +115,19 @@ class A2C(PPO):
             # Clip grad norm
             th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
             self.policy.optimizer.step()
-            # approx_kl_divs.append(th.mean(old_log_prob - log_prob).detach().cpu().numpy())
 
-        # print(explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(),
-        #                          self.rollout_buffer.values.flatten().cpu().numpy()))
+        explained_var = explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(),
+                                           self.rollout_buffer.values.flatten().cpu().numpy())
+
+        logger.logkv("explained_variance", explained_var)
+        logger.logkv("entropy", entropy.mean().item())
+        logger.logkv("policy_loss", policy_loss.item())
+        logger.logkv("value_loss", value_loss.item())
+
+        if self.use_sde:
+            logger.logkv("noise net std", th.exp(self.policy.log_std).mean().item())
+            # print(th.exp(self.policy.log_std).detach())
+
 
     def learn(self, total_timesteps, callback=None, log_interval=100,
               eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="A2C", reset_num_timesteps=True):
diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py
index 7fa318d..8171eb5 100644
--- a/torchy_baselines/ppo/ppo.py
+++ b/torchy_baselines/ppo/ppo.py
@@ -245,8 +245,14 @@ class PPO(BaseRLModel):
                 print("Early stopping at step {} due to reaching max kl: {:.2f}".format(it, np.mean(approx_kl_divs)))
                 break
 
-        # print(explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(),
-        #                          self.rollout_buffer.values.flatten().cpu().numpy()))
+        explained_var = explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(),
+                                           self.rollout_buffer.values.flatten().cpu().numpy())
+
+        logger.logkv("explained_variance", explained_var)
+        # TODO: gather stats for the entropy and other losses?
+        logger.logkv("entropy", entropy.mean().item())
+        logger.logkv("policy_loss", policy_loss.item())
+        logger.logkv("value_loss", value_loss.item())
 
     def learn(self, total_timesteps, callback=None, log_interval=1,
               eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", reset_num_timesteps=True):

From 42d50ed09b29c1ca2b6870deac42a22f7ba95ee2 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Tue, 29 Oct 2019 15:15:54 +0100
Subject: [PATCH 04/19] Add expln option for the SDE standard deviation

---
 tests/test_sde.py                        |  3 ++-
 torchy_baselines/common/distributions.py | 23 ++++++++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/tests/test_sde.py b/tests/test_sde.py
index 7874ae9..3c5db43 100644
--- a/tests/test_sde.py
+++ b/tests/test_sde.py
@@ -43,5 +43,6 @@ def test_state_dependent_exploration():
 
 @pytest.mark.parametrize("model_class", [A2C])
 def test_state_dependent_noise(model_class):
-    model = model_class('MlpPolicy', 'Pendulum-v0', n_steps=200, use_sde=True, verbose=1, create_eval_env=True)
+    model = model_class('MlpPolicy', 'Pendulum-v0', n_steps=200,
+                        use_sde=True, ent_coef=0.0, verbose=1, create_eval_env=True)
     model.learn(total_timesteps=int(1e6), log_interval=10, eval_freq=10000)
diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py
index d420e03..2944d8c 100644
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -166,7 +166,7 @@ class CategoricalDistribution(Distribution):
 
 
 class StateDependentNoiseDistribution(Distribution):
-    def __init__(self, features_dim, action_dim):
+    def __init__(self, features_dim, action_dim, use_expln=False):
         super(StateDependentNoiseDistribution, self).__init__()
         self.distribution = None
         self.action_dim = action_dim
@@ -175,19 +175,28 @@ class StateDependentNoiseDistribution(Distribution):
         self.log_std = None
         self.weights_dist = None
         self.noise_weights = None
+        self.use_expln = use_expln
 
-    @staticmethod
-    def get_std(log_std):
-        # TODO: use expln instead of exp only to avoid sigma growing too fast
-        return th.exp(log_std)
+    def get_std(self, log_std):
+        if self.use_expln:
+            # From SDE paper, it allows to keep variance
+            # above zero and prevent it from growing too fast 
+            if log_std <= 0:
+                return th.exp(log_std)
+            else:
+                return th.log(log_std + 1.0) + 1.0
+        else:
+            return th.exp(log_std)
 
     def sample_weights(self, log_std):
         self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std))
         self.noise_weights = self.weights_dist.rsample()
 
-    def proba_distribution_net(self, latent_dim, log_std_init=0.0):
+    def proba_distribution_net(self, latent_dim, log_std_init=-3):
+        print("Log std init:", log_std_init)
         mean_actions = nn.Linear(latent_dim, self.action_dim)
-        log_std = nn.Parameter(th.zeros(self.features_dim, self.action_dim))
+        # TODO: log_std_init depending on the number of layers?
+        log_std = nn.Parameter(th.ones(self.features_dim, self.action_dim) * log_std_init)
         self.sample_weights(log_std)
         return mean_actions, log_std
 

From c0cb9fc9c57e0a3fd14c5675c2bc206d64f562f4 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Tue, 29 Oct 2019 18:30:36 +0100
Subject: [PATCH 05/19] Fix predict method

---
 torchy_baselines/ppo/ppo.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py
index 8171eb5..8127307 100644
--- a/torchy_baselines/ppo/ppo.py
+++ b/torchy_baselines/ppo/ppo.py
@@ -127,14 +127,14 @@ class PPO(BaseRLModel):
         if self.clip_range_vf is not None:
             self.clip_range_vf = get_schedule_fn(self.clip_range_vf)
 
-    def select_action(self, observation):
+    def select_action(self, observation, deterministic=False):
         # Normally not needed
         observation = np.array(observation)
         with th.no_grad():
             observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device)
-            return self.policy.actor_forward(observation, deterministic=False)
+            return self.policy.actor_forward(observation, deterministic=deterministic)
 
-    def predict(self, observation, state=None, mask=None, deterministic=True):
+    def predict(self, observation, state=None, mask=None, deterministic=False):
         """
         Get the model's action from an observation
 
@@ -144,7 +144,7 @@ class PPO(BaseRLModel):
         :param deterministic: (bool) Whether or not to return deterministic actions.
         :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
         """
-        clipped_actions = self.select_action(observation)
+        clipped_actions = self.select_action(observation, deterministic=deterministic)
         if isinstance(self.action_space, gym.spaces.Box):
             clipped_actions = np.clip(clipped_actions, self.action_space.low, self.action_space.high)
         return clipped_actions

From 9e8f6e00201251d3598e3c3ea279ed100456eac0 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Tue, 29 Oct 2019 18:42:34 +0100
Subject: [PATCH 06/19] Add default filename for monitor

---
 torchy_baselines/common/monitor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchy_baselines/common/monitor.py b/torchy_baselines/common/monitor.py
index c241de9..53bea70 100644
--- a/torchy_baselines/common/monitor.py
+++ b/torchy_baselines/common/monitor.py
@@ -13,7 +13,7 @@ class Monitor(Wrapper):
     EXT = "monitor.csv"
     file_handler = None
 
-    def __init__(self, env, filename, allow_early_resets=True, reset_keywords=(), info_keywords=()):
+    def __init__(self, env, filename=None, allow_early_resets=True, reset_keywords=(), info_keywords=()):
         """
         A monitor wrapper for Gym environments, it is used to know the episode reward, length, time and other data.
 

From 0174ec269e341acc15675f66ebb559100ee59662 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Tue, 29 Oct 2019 18:43:16 +0100
Subject: [PATCH 07/19] Clean up

---
 torchy_baselines/a2c/a2c.py              |  4 +++-
 torchy_baselines/common/distributions.py |  5 ++---
 torchy_baselines/ppo/policies.py         | 18 +++++++-----------
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py
index 6b51227..51eb846 100644
--- a/torchy_baselines/a2c/a2c.py
+++ b/torchy_baselines/a2c/a2c.py
@@ -112,6 +112,7 @@ class A2C(PPO):
             # Optimization step
             self.policy.optimizer.zero_grad()
             loss.backward()
+
             # Clip grad norm
             th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
             self.policy.optimizer.step()
@@ -123,9 +124,10 @@ class A2C(PPO):
         logger.logkv("entropy", entropy.mean().item())
         logger.logkv("policy_loss", policy_loss.item())
         logger.logkv("value_loss", value_loss.item())
+        logger.logkv("std", th.exp(self.policy.log_std).mean().item())
 
         if self.use_sde:
-            logger.logkv("noise net std", th.exp(self.policy.log_std).mean().item())
+            pass
             # print(th.exp(self.policy.log_std).detach())
 
 
diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py
index 2944d8c..5c9cdac 100644
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -180,7 +180,7 @@ class StateDependentNoiseDistribution(Distribution):
     def get_std(self, log_std):
         if self.use_expln:
             # From SDE paper, it allows to keep variance
-            # above zero and prevent it from growing too fast 
+            # above zero and prevent it from growing too fast
             if log_std <= 0:
                 return th.exp(log_std)
             else:
@@ -192,8 +192,7 @@ class StateDependentNoiseDistribution(Distribution):
         self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std))
         self.noise_weights = self.weights_dist.rsample()
 
-    def proba_distribution_net(self, latent_dim, log_std_init=-3):
-        print("Log std init:", log_std_init)
+    def proba_distribution_net(self, latent_dim, log_std_init=-1):
         mean_actions = nn.Linear(latent_dim, self.action_dim)
         # TODO: log_std_init depending on the number of layers?
         log_std = nn.Parameter(th.ones(self.features_dim, self.action_dim) * log_std_init)
diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py
index 08dd7dd..25d0bf3 100644
--- a/torchy_baselines/ppo/policies.py
+++ b/torchy_baselines/ppo/policies.py
@@ -103,7 +103,7 @@ class PPOPolicy(BasePolicy):
     def __init__(self, observation_space, action_space,
                  learning_rate, net_arch=None, device='cpu',
                  activation_fn=nn.Tanh, adam_epsilon=1e-5,
-                 ortho_init=True, use_sde=False):
+                 ortho_init=True, use_sde=False, log_std_init=0.0):
         super(PPOPolicy, self).__init__(observation_space, action_space, device)
         self.obs_dim = self.observation_space.shape[0]
         if net_arch is None:
@@ -123,6 +123,7 @@ class PPOPolicy(BasePolicy):
         # In the future, feature_extractor will be replaced with a CNN
         self.features_extractor = nn.Flatten()
         self.features_dim = self.obs_dim
+        self.log_std_init = log_std_init
         # Action distribution
         self.action_dist = make_proba_distribution(action_space, self.features_dim, use_sde=use_sde)
 
@@ -130,22 +131,14 @@ class PPOPolicy(BasePolicy):
 
     def reset_noise_net(self):
         self.action_dist.sample_weights(self.log_std)
-        # weights_dist = Normal(th.zeros_like(self.noise_log_sigma), th.exp(self.noise_log_sigma))
-        # self.noise_net = weights_dist.rsample()
-        # noise = th.mm(state, weights)
-        # variance = th.mm(state ** 2, sigma ** 2)
-        # action_dist = Normal(mu, th.sqrt(variance))
-        # # action_dist.log_prob((mu + noise).detach())
-        # action_dist.log_prob(action)
-        # # action_dist = Normal(mu_j + noise_j, sum of s_i * sigma_ij)
-        # # log_prob = distribution.log_prob(self.noise_net)
 
     def _build(self, learning_rate):
         self.mlp_extractor = MlpExtractor(self.features_dim, net_arch=self.net_arch,
                                           activation_fn=self.activation_fn, device=self.device)
 
         if isinstance(self.action_dist, (DiagGaussianDistribution, StateDependentNoiseDistribution)):
-            self.action_net, self.log_std = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi)
+            self.action_net, self.log_std = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi,
+                                                                                    log_std_init=self.log_std_init)
         elif isinstance(self.action_dist, CategoricalDistribution):
             self.action_net = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi)
 
@@ -177,10 +170,13 @@ class PPOPolicy(BasePolicy):
 
     def _get_action_dist_from_latent(self, latent, obs, deterministic=False):
         mean_actions = self.action_net(latent)
+
         if isinstance(self.action_dist, DiagGaussianDistribution):
             return self.action_dist.proba_distribution(mean_actions, self.log_std, deterministic=deterministic)
+
         elif isinstance(self.action_dist, CategoricalDistribution):
             return self.action_dist.proba_distribution(mean_actions, deterministic=deterministic)
+
         elif isinstance(self.action_dist, StateDependentNoiseDistribution):
             return self.action_dist.proba_distribution(mean_actions, self.log_std, obs, deterministic=deterministic)
 

From 862ae666b531463158e901d163b7933e9622b10d Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Wed, 30 Oct 2019 15:30:09 +0100
Subject: [PATCH 08/19] Try squashing the sde

---
 torchy_baselines/common/distributions.py | 32 ++++++++++++++++++++----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py
index 5c9cdac..a62ede8 100644
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -119,7 +119,9 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
         # Naive implementation (not stable): 0.5 * torch.log((1 + x ) / (1 - x))
         # We use numpy to avoid numerical instability
         if gaussian_action is None:
-            gaussian_action = th.from_numpy(np.arctanh(action.cpu().numpy())).to(action.device)
+            # Clip strictly inside (-1, 1): arctanh(+/-1) is inf and NaN beyond that range
+            clipped_action = np.clip(action.cpu().numpy(), -1.0 + self.epsilon, 1.0 - self.epsilon)
+            gaussian_action = th.from_numpy(np.arctanh(clipped_action)).to(action.device)
 
         # Log likelihood for a gaussian distribution
         log_prob = super(SquashedDiagGaussianDistribution, self).log_prob(gaussian_action)
@@ -166,7 +168,8 @@ class CategoricalDistribution(Distribution):
 
 
 class StateDependentNoiseDistribution(Distribution):
-    def __init__(self, features_dim, action_dim, use_expln=False):
+    def __init__(self, features_dim, action_dim, use_expln=False,
+                 squash_output=True, epsilon=1e-6):
         super(StateDependentNoiseDistribution, self).__init__()
         self.distribution = None
         self.action_dim = action_dim
@@ -175,7 +178,10 @@ class StateDependentNoiseDistribution(Distribution):
         self.log_std = None
         self.weights_dist = None
         self.noise_weights = None
+        self.gaussian_action = None
         self.use_expln = use_expln
+        self.squash_output = squash_output
+        self.epsilon = epsilon
 
     def get_std(self, log_std):
         if self.use_expln:
@@ -210,13 +216,20 @@ class StateDependentNoiseDistribution(Distribution):
         return action, self
 
     def mode(self):
-        return self.distribution.mean
+        self.gaussian_action = self.distribution.mean
+        if self.squash_output:
+            return th.tanh(self.gaussian_action)
+        return self.gaussian_action
 
     def sample(self, observations):
         noise = th.mm(observations, self.noise_weights)
-        return self.distribution.mean + noise
+        self.gaussian_action = self.distribution.mean + noise
+        if self.squash_output:
+            return th.tanh(self.gaussian_action)
+        return self.gaussian_action
 
     def entropy(self):
+        # TODO: account for the squashing?
         return self.distribution.entropy()
 
     def log_prob_from_params(self, mean_actions, log_std, observations):
@@ -225,11 +238,20 @@ class StateDependentNoiseDistribution(Distribution):
         return action, log_prob
 
     def log_prob(self, action):
-        log_prob = self.distribution.log_prob(action)
+        if self.squash_output:
+            gaussian_action = self.gaussian_action
+        else:
+            gaussian_action = action
+        # log likelihood for a gaussian
+        log_prob = self.distribution.log_prob(gaussian_action)
+        # log_prob = self.distribution.log_prob(action)
         if len(log_prob.shape) > 1:
             log_prob = log_prob.sum(axis=1)
         else:
             log_prob = log_prob.sum()
+        if self.squash_output:
+            # Squash correction (from original SAC implementation)
+            log_prob -= th.sum(th.log(1 - action ** 2 + self.epsilon), dim=1)
         return log_prob
 
 

From 925afe784c595c114d5b3c5f4e81766c8297dca3 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Thu, 31 Oct 2019 11:44:27 +0100
Subject: [PATCH 09/19] SDE on latent_pi

---
 torchy_baselines/a2c/a2c.py              |  1 -
 torchy_baselines/common/distributions.py | 97 ++++++++++++++++--------
 torchy_baselines/ppo/policies.py         | 16 ++--
 3 files changed, 73 insertions(+), 41 deletions(-)

diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py
index 51eb846..d355e09 100644
--- a/torchy_baselines/a2c/a2c.py
+++ b/torchy_baselines/a2c/a2c.py
@@ -112,7 +112,6 @@ class A2C(PPO):
             # Optimization step
             self.policy.optimizer.zero_grad()
             loss.backward()
-
             # Clip grad norm
             th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
             self.policy.optimizer.step()
diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py
index a62ede8..72bf7b2 100644
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -2,6 +2,7 @@ import numpy as np
 import torch as th
 import torch.nn as nn
 from torch.distributions import Normal, Categorical
+import torch.nn.functional as F
 from gym import spaces
 
 class Distribution(object):
@@ -168,20 +169,20 @@ class CategoricalDistribution(Distribution):
 
 
 class StateDependentNoiseDistribution(Distribution):
-    def __init__(self, features_dim, action_dim, use_expln=False,
-                 squash_output=True, epsilon=1e-6):
+    def __init__(self, action_dim, use_expln=False,
+                 squash_output=False, epsilon=1e-6):
         super(StateDependentNoiseDistribution, self).__init__()
         self.distribution = None
         self.action_dim = action_dim
-        self.features_dim = features_dim
         self.mean_actions = None
         self.log_std = None
         self.weights_dist = None
-        self.noise_weights = None
-        self.gaussian_action = None
+        self.exploration_mat = None
         self.use_expln = use_expln
-        self.squash_output = squash_output
-        self.epsilon = epsilon
+        if squash_output:
+            self.bijector = TanhBijector(epsilon)
+        else:
+            self.bijector = None
 
     def get_std(self, log_std):
         if self.use_expln:
@@ -196,71 +197,103 @@ class StateDependentNoiseDistribution(Distribution):
 
     def sample_weights(self, log_std):
         self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std))
-        self.noise_weights = self.weights_dist.rsample()
+        self.exploration_mat = self.weights_dist.rsample()
 
     def proba_distribution_net(self, latent_dim, log_std_init=-1):
         mean_actions = nn.Linear(latent_dim, self.action_dim)
         # TODO: log_std_init depending on the number of layers?
-        log_std = nn.Parameter(th.ones(self.features_dim, self.action_dim) * log_std_init)
+        log_std = nn.Parameter(th.ones(latent_dim, self.action_dim) * log_std_init)
         self.sample_weights(log_std)
         return mean_actions, log_std
 
-    def proba_distribution(self, mean_actions, log_std, observations, deterministic=False):
-        variance = th.mm(observations ** 2, self.get_std(log_std) ** 2)
+    def proba_distribution(self, mean_actions, log_std, latent_pi, deterministic=False):
+        # TODO: try without detach
+        variance = th.mm(latent_pi.detach() ** 2, self.get_std(log_std) ** 2)
         self.distribution = Normal(mean_actions, th.sqrt(variance))
 
         if deterministic:
             action = self.mode()
         else:
-            action = self.sample(observations)
+            action = self.sample(latent_pi)
         return action, self
 
     def mode(self):
-        self.gaussian_action = self.distribution.mean
-        if self.squash_output:
-            return th.tanh(self.gaussian_action)
-        return self.gaussian_action
+        action = self.distribution.mean
+        if self.bijector is not None:
+            return self.bijector.forward(action)
+        return action
 
-    def sample(self, observations):
-        noise = th.mm(observations, self.noise_weights)
-        self.gaussian_action = self.distribution.mean + noise
-        if self.squash_output:
-            return th.tanh(self.gaussian_action)
-        return self.gaussian_action
+    def sample(self, latent_pi):
+        noise = th.mm(latent_pi.detach(), self.exploration_mat)
+        action = self.distribution.mean + noise
+        if self.bijector is not None:
+            return self.bijector.forward(action)
+        return action
 
     def entropy(self):
         # TODO: account for the squashing?
         return self.distribution.entropy()
 
-    def log_prob_from_params(self, mean_actions, log_std, observations):
-        action, _ = self.proba_distribution(mean_actions, log_std, observations)
+    def log_prob_from_params(self, mean_actions, log_std, latent_pi):
+        action, _ = self.proba_distribution(mean_actions, log_std, latent_pi)
         log_prob = self.log_prob(action)
         return action, log_prob
 
     def log_prob(self, action):
-        if self.squash_output:
-            gaussian_action = self.gaussian_action
+        if self.bijector is not None:
+            gaussian_action = self.bijector.inverse(action)
         else:
             gaussian_action = action
         # log likelihood for a gaussian
         log_prob = self.distribution.log_prob(gaussian_action)
-        # log_prob = self.distribution.log_prob(action)
+
         if len(log_prob.shape) > 1:
             log_prob = log_prob.sum(axis=1)
         else:
             log_prob = log_prob.sum()
-        if self.squash_output:
+
+        if self.bijector is not None:
             # Squash correction (from original SAC implementation)
-            log_prob -= th.sum(th.log(1 - action ** 2 + self.epsilon), dim=1)
+            log_prob -= th.sum(self.bijector.log_prob_correction(gaussian_action), dim=1)
         return log_prob
 
 
-def make_proba_distribution(action_space, features_dim=None, use_sde=False):
+class TanhBijector(object):
+    def __init__(self, epsilon=1e-6):
+        super(TanhBijector, self).__init__()
+        self.epsilon = epsilon
+
+    def forward(self, x):
+        return th.tanh(x)
+
+    def inverse(self, action):
+        """
+        Inverse tanh.
+
+        From https://github.com/tensorflow/agents:
+        0.99999997 is the maximum value such that atanh(x) is valid for both
+        float32 and float64
+
+        :param action: (th.Tensor)
+        :return: (th.Tensor)
+        """
+        # Inverse tanh
+        # Naive implementation (not stable): 0.5 * torch.log((1 + x ) / (1 - x))
+        # We use numpy to avoid numerical instability
+        # Note: Using numpy, we do not keep the gradient
+        clipped_action = np.clip(action.cpu().numpy(), -0.99999997, 0.99999997)
+        return th.from_numpy(np.arctanh(clipped_action)).to(action.device)
+
+    def log_prob_correction(self, x):
+        # Squash correction (from original SAC implementation)
+        return th.log(1 - th.tanh(x) ** 2 + self.epsilon)
+
+
+def make_proba_distribution(action_space, use_sde=False):
     """
     Return an instance of Distribution for the correct type of action space
 
     :param action_space: (Gym Space) the input action space
-    :param feature_dim: (int) Dimension of the feature vector
     :param use_sde: (bool) Force the use of StateDependentNoiseDistribution
         instead of DiagGaussianDistribution
     :return: (Distribution) the approriate Distribution object
@@ -268,7 +301,7 @@ def make_proba_distribution(action_space, features_dim=None, use_sde=False):
     if isinstance(action_space, spaces.Box):
         assert len(action_space.shape) == 1, "Error: the action space must be a vector"
         if use_sde:
-            return StateDependentNoiseDistribution(features_dim, action_space.shape[0])
+            return StateDependentNoiseDistribution(action_space.shape[0])
         return DiagGaussianDistribution(action_space.shape[0])
     elif isinstance(action_space, spaces.Discrete):
         return CategoricalDistribution(action_space.n)
diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py
index 25d0bf3..1e3b25c 100644
--- a/torchy_baselines/ppo/policies.py
+++ b/torchy_baselines/ppo/policies.py
@@ -125,7 +125,7 @@ class PPOPolicy(BasePolicy):
         self.features_dim = self.obs_dim
         self.log_std_init = log_std_init
         # Action distribution
-        self.action_dist = make_proba_distribution(action_space, self.features_dim, use_sde=use_sde)
+        self.action_dist = make_proba_distribution(action_space, use_sde=use_sde)
 
         self._build(learning_rate)
 
@@ -161,15 +161,15 @@ class PPOPolicy(BasePolicy):
             obs = th.FloatTensor(obs).to(self.device)
         latent_pi, latent_vf = self._get_latent(obs)
         value = self.value_net(latent_vf)
-        action, action_distribution = self._get_action_dist_from_latent(latent_pi, obs, deterministic=deterministic)
+        action, action_distribution = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic)
         log_prob = action_distribution.log_prob(action)
         return action, value, log_prob
 
     def _get_latent(self, obs):
         return self.mlp_extractor(self.features_extractor(obs))
 
-    def _get_action_dist_from_latent(self, latent, obs, deterministic=False):
-        mean_actions = self.action_net(latent)
+    def _get_action_dist_from_latent(self, latent_pi, deterministic=False):
+        mean_actions = self.action_net(latent_pi)
 
         if isinstance(self.action_dist, DiagGaussianDistribution):
             return self.action_dist.proba_distribution(mean_actions, self.log_std, deterministic=deterministic)
@@ -178,16 +178,16 @@ class PPOPolicy(BasePolicy):
             return self.action_dist.proba_distribution(mean_actions, deterministic=deterministic)
 
         elif isinstance(self.action_dist, StateDependentNoiseDistribution):
-            return self.action_dist.proba_distribution(mean_actions, self.log_std, obs, deterministic=deterministic)
+            return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_pi, deterministic=deterministic)
 
     def actor_forward(self, obs, deterministic=False):
         latent_pi, _ = self._get_latent(obs)
-        action, _ = self._get_action_dist_from_latent(latent_pi, obs, deterministic=deterministic)
+        action, _ = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic)
         return action.detach().cpu().numpy()
 
-    def get_policy_stats(self, obs, action):
+    def get_policy_stats(self, obs, action, deterministic=False):
         latent_pi, latent_vf = self._get_latent(obs)
-        _, action_distribution = self._get_action_dist_from_latent(latent_pi, obs)
+        _, action_distribution = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic)
         log_prob = action_distribution.log_prob(action)
         value = self.value_net(latent_vf)
         return value, log_prob, action_distribution.entropy()

From 72a6f18e4309c101da565e0b816cfdf6ea6f55b5 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Thu, 31 Oct 2019 14:14:30 +0100
Subject: [PATCH 10/19] Add sde test + fix random seed

---
 tests/test_sde.py                        | 27 ++++++++++++++++--------
 torchy_baselines/common/base_class.py    |  4 +++-
 torchy_baselines/common/distributions.py |  3 +--
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/tests/test_sde.py b/tests/test_sde.py
index 3c5db43..f397558 100644
--- a/tests/test_sde.py
+++ b/tests/test_sde.py
@@ -7,24 +7,22 @@ from torchy_baselines import A2C
 
 
 def test_state_dependent_exploration():
+    n_states = 2
     state_dim = 3
     # TODO: fix for action_dim > 1
     action_dim = 1
     sigma = th.ones(state_dim, action_dim, requires_grad=True)
 
-    # log_sigma = th.ones(2, 1, requires_grad=True)
-
     # weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma))
     th.manual_seed(2)
     weights_dist = Normal(th.zeros_like(sigma), sigma)
 
     weights = weights_dist.rsample()
-    state = th.rand(1, state_dim)
-    # state = (th.ones(state_dim,) * 2).view(1, -1)
+    state = th.rand(n_states, state_dim)
     mu = th.ones(action_dim)
     # print(weights.shape, state.shape)
     noise = th.mm(state, weights)
-    # variance = th.mm(state ** 2, th.exp(log_sigma) ** 2)
+
     variance = th.mm(state ** 2, sigma ** 2)
     action_dist = Normal(mu, th.sqrt(variance))
 
@@ -35,7 +33,8 @@ def test_state_dependent_exploration():
     grad = th.zeros_like(sigma)
     for j in range(action_dim):
         for i in range(state_dim):
-            grad[i, j] = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j])
+            a = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j])
+            grad[i, j] = a.mean()
 
     # sigma.grad should be equal to grad
     assert sigma.grad.allclose(grad)
@@ -43,6 +42,16 @@ def test_state_dependent_exploration():
 
 @pytest.mark.parametrize("model_class", [A2C])
 def test_state_dependent_noise(model_class):
-    model = model_class('MlpPolicy', 'Pendulum-v0', n_steps=200,
-                        use_sde=True, ent_coef=0.0, verbose=1, create_eval_env=True)
-    model.learn(total_timesteps=int(1e6), log_interval=10, eval_freq=10000)
+    import gym
+    from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize
+    from torchy_baselines.common.monitor import Monitor
+
+    # env_id = 'Pendulum-v0'
+    env_id = 'MountainCarContinuous-v0'
+    # env_id = 'LunarLanderContinuous-v2'
+    env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), norm_reward=True)
+    eval_env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), training=False, norm_reward=False)
+    model = model_class('MlpPolicy', env, n_steps=200, max_grad_norm=1, use_rms_prop=False,
+                        use_sde=True, ent_coef=0.00, verbose=1, create_eval_env=True, learning_rate=3e-4,
+                        policy_kwargs=dict(log_std_init=0.0, ortho_init=False, net_arch=[256, dict(pi=[256], vf=[256])]), seed=None)
+    model.learn(total_timesteps=int(20000), log_interval=5, eval_freq=10000, eval_env=eval_env)
diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py
index 6adc45c..a6b9a41 100644
--- a/torchy_baselines/common/base_class.py
+++ b/torchy_baselines/common/base_class.py
@@ -282,7 +282,9 @@ class BaseRLModel(object):
         """
         raise NotImplementedError()
 
-    def set_random_seed(self, seed=0):
+    def set_random_seed(self, seed=None):
+        if seed is None:
+            return
         set_random_seed(seed, using_cuda=self.device == th.device('cuda'))
         self.action_space.seed(seed)
         if self.env is not None:
diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py
index 72bf7b2..90105ee 100644
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -199,9 +199,8 @@ class StateDependentNoiseDistribution(Distribution):
         self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std))
         self.exploration_mat = self.weights_dist.rsample()
 
-    def proba_distribution_net(self, latent_dim, log_std_init=-1):
+    def proba_distribution_net(self, latent_dim, log_std_init=0.0):
         mean_actions = nn.Linear(latent_dim, self.action_dim)
-        # TODO: log_std_init depending on the number of layers?
         log_std = nn.Parameter(th.ones(latent_dim, self.action_dim) * log_std_init)
         self.sample_weights(log_std)
         return mean_actions, log_std

From 9644ae89cfa77f920a4f7c779595b3ebbb374c2a Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Thu, 31 Oct 2019 16:17:08 +0100
Subject: [PATCH 11/19] Log ppo std

---
 torchy_baselines/ppo/ppo.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py
index 8127307..bbf880b 100644
--- a/torchy_baselines/ppo/ppo.py
+++ b/torchy_baselines/ppo/ppo.py
@@ -253,6 +253,7 @@ class PPO(BaseRLModel):
         logger.logkv("entropy", entropy.mean().item())
         logger.logkv("policy_loss", policy_loss.item())
         logger.logkv("value_loss", value_loss.item())
+        logger.logkv("std", th.exp(self.policy.log_std).mean().item())
 
     def learn(self, total_timesteps, callback=None, log_interval=1,
               eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", reset_num_timesteps=True):

From 0e092f7c528d86bdfa348f70899790edb65770d5 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Thu, 31 Oct 2019 16:59:35 +0100
Subject: [PATCH 12/19] Add plotting script

---
 torchy_baselines/a2c/a2c.py | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py
index d355e09..fed4290 100644
--- a/torchy_baselines/a2c/a2c.py
+++ b/torchy_baselines/a2c/a2c.py
@@ -64,6 +64,8 @@ class A2C(PPO):
         self.normalize_advantage = normalize_advantage
         self.rms_prop_eps = rms_prop_eps
         self.use_rms_prop = use_rms_prop
+        self.actions = []
+        self.states = []
 
         if _init_setup_model:
             self._setup_model()
@@ -125,12 +127,39 @@ class A2C(PPO):
         logger.logkv("value_loss", value_loss.item())
         logger.logkv("std", th.exp(self.policy.log_std).mean().item())
 
-        if self.use_sde:
-            pass
-            # print(th.exp(self.policy.log_std).detach())
+        self.states.append(self.rollout_buffer.observations.cpu().numpy())
+        self.actions.append(self.rollout_buffer.actions.cpu().numpy())
+
+        # Plot for MountainCarContinuous-v0
+        if True:
+            if len(self.actions) > 10:
+                import matplotlib.pyplot as plt
+                import numpy as np
+                actions = np.concatenate(self.actions)
+                x = np.arange(len(actions))
+                plt.figure("actions")
+                start = 0
+                for i in range(len(self.actions)):
+                    end = start + len(self.actions[i])
+                    # plt.plot(x[start:end], self.actions[i])
+                    # Clipped actions: real behavior, note that it is between [-2, 2] for the Pendulum
+                    plt.scatter(x[start:end], np.clip(self.actions[i], -1, 1), s=1)
+                    # plt.scatter(x[start:end], self.actions[i], s=1)
+                    start = end
+
+                plt.figure("states")
+                for i in range(len(self.states)):
+                    if len(self.states[i].shape) > 1:
+                        # plt.plot(self.states[i][:, 0], self.states[i][:, 1])
+                        plt.scatter(self.states[i][:, 0], self.states[i][:, 1], s=1)
+                    else:
+                        plt.scatter(x[start:end], self.states[i], s=1)
+
+                plt.show()
+                import ipdb; ipdb.set_trace()
 
 
-    def learn(self, total_timesteps, callback=None, log_interval=100,
+    def learn(self, total_timesteps, callback=None, log_interval=5,
               eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="A2C", reset_num_timesteps=True):
 
         return super(A2C, self).learn(total_timesteps=total_timesteps, callback=callback, log_interval=log_interval,

From 9acff0f5b37ea57932279b774c17ad1030f9e74f Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Thu, 31 Oct 2019 17:01:27 +0100
Subject: [PATCH 13/19] Remove plotting script

---
 torchy_baselines/a2c/a2c.py | 34 ----------------------------------
 1 file changed, 34 deletions(-)

diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py
index fed4290..8807c91 100644
--- a/torchy_baselines/a2c/a2c.py
+++ b/torchy_baselines/a2c/a2c.py
@@ -64,8 +64,6 @@ class A2C(PPO):
         self.normalize_advantage = normalize_advantage
         self.rms_prop_eps = rms_prop_eps
         self.use_rms_prop = use_rms_prop
-        self.actions = []
-        self.states = []
 
         if _init_setup_model:
             self._setup_model()
@@ -127,38 +125,6 @@ class A2C(PPO):
         logger.logkv("value_loss", value_loss.item())
         logger.logkv("std", th.exp(self.policy.log_std).mean().item())
 
-        self.states.append(self.rollout_buffer.observations.cpu().numpy())
-        self.actions.append(self.rollout_buffer.actions.cpu().numpy())
-
-        # Plot for MountainCarContinuous-v0
-        if True:
-            if len(self.actions) > 10:
-                import matplotlib.pyplot as plt
-                import numpy as np
-                actions = np.concatenate(self.actions)
-                x = np.arange(len(actions))
-                plt.figure("actions")
-                start = 0
-                for i in range(len(self.actions)):
-                    end = start + len(self.actions[i])
-                    # plt.plot(x[start:end], self.actions[i])
-                    # Clipped actions: real behavior, note that it is between [-2, 2] for the Pendulum
-                    plt.scatter(x[start:end], np.clip(self.actions[i], -1, 1), s=1)
-                    # plt.scatter(x[start:end], self.actions[i], s=1)
-                    start = end
-
-                plt.figure("states")
-                for i in range(len(self.states)):
-                    if len(self.states[i].shape) > 1:
-                        # plt.plot(self.states[i][:, 0], self.states[i][:, 1])
-                        plt.scatter(self.states[i][:, 0], self.states[i][:, 1], s=1)
-                    else:
-                        plt.scatter(x[start:end], self.states[i], s=1)
-
-                plt.show()
-                import ipdb; ipdb.set_trace()
-
-
     def learn(self, total_timesteps, callback=None, log_interval=5,
               eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="A2C", reset_num_timesteps=True):
 

From 6c7c8375a47cd4ecaa53dcf8a3571cff57f81235 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Thu, 7 Nov 2019 11:16:59 +0100
Subject: [PATCH 14/19] Update log interval

---
 torchy_baselines/a2c/a2c.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py
index 8807c91..9e6af14 100644
--- a/torchy_baselines/a2c/a2c.py
+++ b/torchy_baselines/a2c/a2c.py
@@ -125,7 +125,7 @@ class A2C(PPO):
         logger.logkv("value_loss", value_loss.item())
         logger.logkv("std", th.exp(self.policy.log_std).mean().item())
 
-    def learn(self, total_timesteps, callback=None, log_interval=5,
+    def learn(self, total_timesteps, callback=None, log_interval=100,
               eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="A2C", reset_num_timesteps=True):
 
         return super(A2C, self).learn(total_timesteps=total_timesteps, callback=callback, log_interval=log_interval,

From c6f90b9c3c62f3f4e6869f49982d0989f15e6aa4 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Thu, 7 Nov 2019 11:17:26 +0100
Subject: [PATCH 15/19] Improve VecNormalize syncing for evaluation

---
 torchy_baselines/common/distributions.py |  2 +-
 torchy_baselines/ppo/ppo.py              | 11 ++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py
index 90105ee..37f4cf8 100644
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -180,6 +180,7 @@ class StateDependentNoiseDistribution(Distribution):
         self.exploration_mat = None
         self.use_expln = use_expln
         if squash_output:
+            print("== Using TanhBijector ===")
             self.bijector = TanhBijector(epsilon)
         else:
             self.bijector = None
@@ -206,7 +207,6 @@ class StateDependentNoiseDistribution(Distribution):
         return mean_actions, log_std
 
     def proba_distribution(self, mean_actions, log_std, latent_pi, deterministic=False):
-        # TODO: try without detach
         variance = th.mm(latent_pi.detach() ** 2, self.get_std(log_std) ** 2)
         self.distribution = Normal(mean_actions, th.sqrt(variance))
 
diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py
index bbf880b..32e5f95 100644
--- a/torchy_baselines/ppo/ppo.py
+++ b/torchy_baselines/ppo/ppo.py
@@ -17,7 +17,7 @@ from torchy_baselines.common.base_class import BaseRLModel
 from torchy_baselines.common.evaluation import evaluate_policy
 from torchy_baselines.common.buffers import RolloutBuffer
 from torchy_baselines.common.utils import explained_variance, get_schedule_fn
-from torchy_baselines.common.vec_env import VecNormalize
+from torchy_baselines.common.vec_env import VecNormalize, VecEnvWrapper
 from torchy_baselines.common import logger
 from torchy_baselines.ppo.policies import PPOPolicy
 
@@ -294,9 +294,14 @@ class PPO(BaseRLModel):
             # Evaluate agent
             if 0 < eval_freq <= timesteps_since_eval and eval_env is not None:
                 timesteps_since_eval %= eval_freq
+                # TODO: move that to the base class
                 # Sync eval env and train env when using VecNormalize
-                if isinstance(self.env, VecNormalize):
-                    eval_env.obs_rms = deepcopy(self.env.obs_rms)
+                env_tmp, eval_env_tmp = self.env, eval_env
+                while isinstance(env_tmp, VecEnvWrapper):
+                    if isinstance(env_tmp, VecNormalize):
+                        eval_env_tmp.obs_rms = deepcopy(env_tmp.obs_rms)
+                    env_tmp = env_tmp.venv
+                    eval_env_tmp = eval_env_tmp.venv  # unwrap the eval env in lockstep
                 mean_reward, _ = evaluate_policy(self, eval_env, n_eval_episodes)
                 if self.tb_writer is not None:
                     self.tb_writer.add_scalar('Eval/reward', mean_reward, self.num_timesteps)

From 95c741c7073ce2a318af769e765c2f6cf7be4f3b Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Thu, 7 Nov 2019 17:01:02 +0100
Subject: [PATCH 16/19] Fix logger for discrete actions

---
 torchy_baselines/a2c/a2c.py | 3 ++-
 torchy_baselines/ppo/ppo.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py
index 9e6af14..41ca60c 100644
--- a/torchy_baselines/a2c/a2c.py
+++ b/torchy_baselines/a2c/a2c.py
@@ -123,7 +123,8 @@ class A2C(PPO):
         logger.logkv("entropy", entropy.mean().item())
         logger.logkv("policy_loss", policy_loss.item())
         logger.logkv("value_loss", value_loss.item())
-        logger.logkv("std", th.exp(self.policy.log_std).mean().item())
+        if hasattr(self.policy, 'log_std'):
+            logger.logkv("std", th.exp(self.policy.log_std).mean().item())
 
     def learn(self, total_timesteps, callback=None, log_interval=100,
               eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="A2C", reset_num_timesteps=True):
diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py
index 32e5f95..2b2a5e6 100644
--- a/torchy_baselines/ppo/ppo.py
+++ b/torchy_baselines/ppo/ppo.py
@@ -253,7 +253,8 @@ class PPO(BaseRLModel):
         logger.logkv("entropy", entropy.mean().item())
         logger.logkv("policy_loss", policy_loss.item())
         logger.logkv("value_loss", value_loss.item())
-        logger.logkv("std", th.exp(self.policy.log_std).mean().item())
+        if hasattr(self.policy, 'log_std'):
+            logger.logkv("std", th.exp(self.policy.log_std).mean().item())
 
     def learn(self, total_timesteps, callback=None, log_interval=1,
               eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", reset_num_timesteps=True):

From 5d353d598ca53c0a20a859f66f4f2a05ea41627c Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Mon, 18 Nov 2019 14:09:31 +0100
Subject: [PATCH 17/19] Start cleanup + update docstrings

---
 tests/test_sde.py                        | 25 +++++++++++---------
 torchy_baselines/common/distributions.py | 29 ++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/tests/test_sde.py b/tests/test_sde.py
index f397558..03c8f62 100644
--- a/tests/test_sde.py
+++ b/tests/test_sde.py
@@ -1,17 +1,25 @@
 import pytest
 
+import gym
 import torch as th
 from torch.distributions import Normal
 
 from torchy_baselines import A2C
+from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize
+from torchy_baselines.common.monitor import Monitor
 
 
 def test_state_dependent_exploration():
+    """
+    Check that the gradient corresponds to the expected one
+    """
     n_states = 2
     state_dim = 3
     # TODO: fix for action_dim > 1
     action_dim = 1
-    sigma = th.ones(state_dim, action_dim, requires_grad=True)
+    sigma = th.ones(state_dim, 1, requires_grad=True)
+    # Reduce the number of parameters
+    # sigma_ = th.ones(state_dim, action_dim) * sigma_
 
     # weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma))
     th.manual_seed(2)
@@ -42,16 +50,11 @@ def test_state_dependent_exploration():
 
 @pytest.mark.parametrize("model_class", [A2C])
 def test_state_dependent_noise(model_class):
-    import gym
-    from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize
-    from torchy_baselines.common.monitor import Monitor
-
-    # env_id = 'Pendulum-v0'
     env_id = 'MountainCarContinuous-v0'
-    # env_id = 'LunarLanderContinuous-v2'
+
     env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), norm_reward=True)
     eval_env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), training=False, norm_reward=False)
-    model = model_class('MlpPolicy', env, n_steps=200, max_grad_norm=1, use_rms_prop=False,
-                        use_sde=True, ent_coef=0.00, verbose=1, create_eval_env=True, learning_rate=3e-4,
-                        policy_kwargs=dict(log_std_init=0.0, ortho_init=False, net_arch=[256, dict(pi=[256], vf=[256])]), seed=None)
-    model.learn(total_timesteps=int(20000), log_interval=5, eval_freq=10000, eval_env=eval_env)
+
+    model = model_class('MlpPolicy', env, n_steps=200, use_sde=True, ent_coef=0.00, verbose=1, learning_rate=3e-4,
+                        policy_kwargs=dict(log_std_init=0.0, ortho_init=False), seed=None)
+    model.learn(total_timesteps=int(1000), log_interval=5, eval_freq=500, eval_env=eval_env)
diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py
index 37f4cf8..82a07a2 100644
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -45,6 +45,11 @@ class Distribution(object):
 
 
 class DiagGaussianDistribution(Distribution):
+    """
+    Gaussian distribution with diagonal covariance matrix.
+
+    :param action_dim: (int) Number of actions
+    """
     def __init__(self, action_dim):
         super(DiagGaussianDistribution, self).__init__()
         self.distribution = None
@@ -53,12 +58,28 @@ class DiagGaussianDistribution(Distribution):
         self.log_std = None
 
     def proba_distribution_net(self, latent_dim, log_std_init=0.0):
+        """
+        Create the layers and parameter that represent the distribution:
+        one output will be the mean of the gaussian, the other parameter will be the
+        standard deviation (log std in fact to allow negative values)
+
+        :param latent_dim: (int) Dimension og the last layer of the policy (before the action layer)
+        :param log_std_init: (float) Initial value for the log standard deviation
+        """
         mean_actions = nn.Linear(latent_dim, self.action_dim)
         # TODO: allow action dependent std
         log_std = nn.Parameter(th.ones(self.action_dim) * log_std_init)
         return mean_actions, log_std
 
     def proba_distribution(self, mean_actions, log_std, deterministic=False):
+        """
+        Create and sample for the distribution given its parameters (mean, std)
+
+        :param mean_actions: (th.Tensor)
+        :param log_std: (th.Tensor)
+        :param deterministic: (bool)
+        :return: (th.Tensor)
+        """
         action_std = th.ones_like(mean_actions) * log_std.exp()
         self.distribution = Normal(mean_actions, action_std)
         if deterministic:
@@ -77,6 +98,14 @@ class DiagGaussianDistribution(Distribution):
         return self.distribution.entropy()
 
     def log_prob_from_params(self, mean_actions, log_std):
+        """
+        Compute the log probability of taking an action
+        given the distribution parameters.
+
+        :param mean_actions: (th.Tensor)
+        :param log_std: (th.Tensor)
+        :return: (th.Tensor, th.Tensor)
+        """
         action, _ = self.proba_distribution(mean_actions, log_std)
         log_prob = self.log_prob(action)
         return action, log_prob

From b9c20d443d7d0d719ba59635e9357f8a53ed98e9 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Mon, 18 Nov 2019 15:04:07 +0100
Subject: [PATCH 18/19] Update doc + add test for tanh bijector

---
 tests/test_distributions.py              |  20 ++++
 torchy_baselines/common/distributions.py | 122 +++++++++++++++++++----
 2 files changed, 124 insertions(+), 18 deletions(-)
 create mode 100644 tests/test_distributions.py

diff --git a/tests/test_distributions.py b/tests/test_distributions.py
new file mode 100644
index 0000000..47651e4
--- /dev/null
+++ b/tests/test_distributions.py
@@ -0,0 +1,20 @@
+import numpy as np
+import torch as th
+
+from torchy_baselines.common.distributions import DiagGaussianDistribution, SquashedDiagGaussianDistribution,\
+    CategoricalDistribution, TanhBijector
+
+# TODO: more tests for the other distributions
+def test_bijector():
+    """
+    Test TanhBijector
+    """
+    actions = th.ones(5) * 2.0
+
+    bijector = TanhBijector()
+
+    squashed_actions = bijector.forward(actions)
+    # Check that the boundaries are not violated
+    assert th.max(th.abs(squashed_actions)) <= 1.0
+    # Check the inverse method
+    assert th.isclose(TanhBijector.inverse(squashed_actions), actions).all()
diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py
index 82a07a2..9d67b39 100644
--- a/torchy_baselines/common/distributions.py
+++ b/torchy_baselines/common/distributions.py
@@ -46,9 +46,10 @@ class Distribution(object):
 
 class DiagGaussianDistribution(Distribution):
     """
-    Gaussian distribution with diagonal covariance matrix.
+    Gaussian distribution with diagonal covariance matrix,
+    for continuous actions.
 
-    :param action_dim: (int) Number of actions
+    :param action_dim: (int)  Number of continuous actions
     """
     def __init__(self, action_dim):
         super(DiagGaussianDistribution, self).__init__()
@@ -65,6 +66,7 @@ class DiagGaussianDistribution(Distribution):
 
         :param latent_dim: (int) Dimension og the last layer of the policy (before the action layer)
         :param log_std_init: (float) Initial value for the log standard deviation
+        :return: (nn.Linear, nn.Parameter)
         """
         mean_actions = nn.Linear(latent_dim, self.action_dim)
         # TODO: allow action dependent std
@@ -111,6 +113,14 @@ class DiagGaussianDistribution(Distribution):
         return action, log_prob
 
     def log_prob(self, action):
+        """
+        Get the log probability of an action given a distribution.
+        Note that you must call `proba_distribution()` method
+        before.
+
+        :param action: (th.Tensor)
+        :return: (th.Tensor)
+        """
         log_prob = self.distribution.log_prob(action)
         if len(log_prob.shape) > 1:
             log_prob = log_prob.sum(axis=1)
@@ -120,6 +130,13 @@ class DiagGaussianDistribution(Distribution):
 
 
 class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
+    """
+    Gaussian distribution with diagonal covariance matrix,
+    followed by a squashing function (tanh) to ensure bounds.
+
+    :param action_dim: (int) Number of continuous actions
+    :param epsilon: (float) small value to avoid NaN due to numerical imprecision.
+    """
     def __init__(self, action_dim, epsilon=1e-6):
         super(SquashedDiagGaussianDistribution, self).__init__(action_dim)
         # Avoid NaN (prevents division by zero or log of zero)
@@ -146,27 +163,40 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
 
     def log_prob(self, action, gaussian_action=None):
         # Inverse tanh
-        # Naive implementation (not stable): 0.5 * torch.log((1 + x ) / (1 - x))
+        # Naive implementation (not stable): 0.5 * torch.log((1 + x) / (1 - x))
         # We use numpy to avoid numerical instability
         if gaussian_action is None:
-            # Clip to avoid NaN
-            clipped_action = np.clip(action.cpu().numpy(), -1.0 + self.epsilon, 1.0 + self.epsilon)
-            gaussian_action = th.from_numpy(np.arctanh(clipped_action)).to(action.device)
+            # It will be clipped to avoid NaN when inversing tanh
+            gaussian_action = TanhBijector.inverse(action)
 
         # Log likelihood for a gaussian distribution
         log_prob = super(SquashedDiagGaussianDistribution, self).log_prob(gaussian_action)
         # Squash correction (from original SAC implementation)
+        # this comes from the fact that tanh is bijective and differentiable
         log_prob -= th.sum(th.log(1 - action ** 2 + self.epsilon), dim=1)
         return log_prob
 
 
 class CategoricalDistribution(Distribution):
+    """
+    Categorical distribution for discrete actions.
+
+    :param action_dim: (int) Number of discrete actions
+    """
     def __init__(self, action_dim):
         super(CategoricalDistribution, self).__init__()
         self.distribution = None
         self.action_dim = action_dim
 
     def proba_distribution_net(self, latent_dim):
+        """
+        Create the layer that represents the distribution:
+        it will be the logits of the Categorical distribution.
+        You can then get probabilities using a softmax.
+
+        :param latent_dim: (int) Dimension of the last layer of the policy (before the action layer)
+        :return: (nn.Linear)
+        """
         action_logits = nn.Linear(latent_dim, self.action_dim)
         return action_logits
 
@@ -198,6 +228,19 @@ class CategoricalDistribution(Distribution):
 
 
 class StateDependentNoiseDistribution(Distribution):
+    """
+    Distribution class for using State Dependent Exploration (SDE).
+    It is used to create the noise exploration matrix and
+    compute the log probability of an action with that noise.
+
+    :param action_dim: (int) Number of continuous actions
+    :param use_expln: (bool) Use `expln()` function instead of `exp()` to ensure
+        a positive standard deviation (cf paper). It allows to keep variance
+        above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
+    :param squash_output: (bool) Whether to squash the output using a tanh function,
+        this allows to ensure boundaries.
+    :param epsilon: (float) small value to avoid NaN due to numerical imprecision.
+    """
     def __init__(self, action_dim, use_expln=False,
                  squash_output=False, epsilon=1e-6):
         super(StateDependentNoiseDistribution, self).__init__()
@@ -215,6 +258,13 @@ class StateDependentNoiseDistribution(Distribution):
             self.bijector = None
 
     def get_std(self, log_std):
+        """
+        Get the standard deviation from the learned parameter
+        (log of it by default). This ensures that the std is positive.
+
+        :param log_std: (th.Tensor)
+        :return: (th.Tensor)
+        """
         if self.use_expln:
             # From SDE paper, it allows to keep variance
             # above zero and prevent it from growing too fast
@@ -223,19 +273,44 @@ class StateDependentNoiseDistribution(Distribution):
             else:
                 return th.log(log_std + 1.0) + 1.0
         else:
+            # Use normal exponential
             return th.exp(log_std)
 
     def sample_weights(self, log_std):
+        """
+        Sample weights for the noise exploration matrix,
+        using a centered gaussian distribution.
+
+        :param log_std: (th.Tensor)
+        """
+        # TODO: reduce the number of learned dimensions (cf TD3)
         self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std))
         self.exploration_mat = self.weights_dist.rsample()
 
     def proba_distribution_net(self, latent_dim, log_std_init=0.0):
+        """
+        Create the layers and parameter that represent the distribution:
+        one output will be the deterministic action, the other parameter will be the
+        standard deviation of the distribution that controls the weights of the noise matrix.
+
+        :param latent_dim: (int) Dimension of the last layer of the policy (before the action layer)
+        :param log_std_init: (float) Initial value for the log standard deviation
+        :return: (nn.Linear, nn.Parameter)
+        """
         mean_actions = nn.Linear(latent_dim, self.action_dim)
         log_std = nn.Parameter(th.ones(latent_dim, self.action_dim) * log_std_init)
         self.sample_weights(log_std)
         return mean_actions, log_std
 
     def proba_distribution(self, mean_actions, log_std, latent_pi, deterministic=False):
+        """
+        Create and sample for the distribution given its parameters (mean, std)
+
+        :param mean_actions: (th.Tensor)
+        :param log_std: (th.Tensor)
+        :param deterministic: (bool)
+        :return: (th.Tensor)
+        """
         variance = th.mm(latent_pi.detach() ** 2, self.get_std(log_std) ** 2)
         self.distribution = Normal(mean_actions, th.sqrt(variance))
 
@@ -287,6 +362,13 @@ class StateDependentNoiseDistribution(Distribution):
 
 
 class TanhBijector(object):
+    """
+    Bijective transformation of a probability distribution
+    using a squashing function (tanh)
+    TODO: use Pyro instead (https://pyro.ai/)
+
+    :param epsilon: (float) small value to avoid NaN due to numerical imprecision.
+    """
     def __init__(self, epsilon=1e-6):
         super(TanhBijector, self).__init__()
         self.epsilon = epsilon
@@ -294,23 +376,27 @@ class TanhBijector(object):
     def forward(self, x):
         return th.tanh(x)
 
-    def inverse(self, action):
+    @staticmethod
+    def atanh(x):
+        """
+        Inverse of Tanh
+
+        Taken from pyro: https://github.com/pyro-ppl/pyro
+        0.5 * torch.log((1 + x ) / (1 - x))
+        """
+        return 0.5 * (x.log1p() - (-x).log1p())
+
+    @staticmethod
+    def inverse(y):
         """
         Inverse tanh.
 
-        From https://github.com/tensorflow/agents:
-        0.99999997 is the maximum value such that atanh(x) is valid for both
-        float32 and float64
-
-        :param action: (th.Tensor)
+        :param y: (th.Tensor)
         :return: (th.Tensor)
         """
-        # Inverse tanh
-        # Naive implementation (not stable): 0.5 * torch.log((1 + x ) / (1 - x))
-        # We use numpy to avoid numerical instability
-        # Note: Using numpy, we do not keep the gradient
-        clipped_action = np.clip(action.cpu().numpy(), -0.99999997, 0.99999997)
-        return th.from_numpy(np.arctanh(clipped_action)).to(action.device)
+        eps = th.finfo(y.dtype).eps
+        # Clip the action to avoid NaN
+        return TanhBijector.atanh(y.clamp(min=-1. + eps, max=1. - eps))
 
     def log_prob_correction(self, x):
         # Squash correction (from original SAC implementation)

From ef59a7e431293a5904197f51578b34c4db4e38b3 Mon Sep 17 00:00:00 2001
From: Antonin Raffin <antonin.raffin@dlr.de>
Date: Mon, 18 Nov 2019 15:11:19 +0100
Subject: [PATCH 19/19] Update version + add docstring

---
 setup.py                              | 2 +-
 torchy_baselines/common/base_class.py | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 89a3b57..f40a7da 100644
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ setup(name='torchy_baselines',
       license="MIT",
       long_description="",
       long_description_content_type='text/markdown',
-      version="0.0.4",
+      version="0.0.5a",
       )
 
 # python setup.py sdist
diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py
index a6b9a41..89b3e98 100644
--- a/torchy_baselines/common/base_class.py
+++ b/torchy_baselines/common/base_class.py
@@ -283,6 +283,12 @@ class BaseRLModel(object):
         raise NotImplementedError()
 
     def set_random_seed(self, seed=None):
+        """
+        Set the seed of the pseudo-random generators
+        (python, numpy, pytorch, gym, action_space)
+
+        :param seed: (int)
+        """
         if seed is None:
             return
         set_random_seed(seed, using_cuda=self.device == th.device('cuda'))