From c15b4bda1ef8e4dd821b758ef7532165ab82c487 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 28 Oct 2019 18:24:13 +0100 Subject: [PATCH 01/19] Add first draft of SDE --- tests/test_sde.py | 47 +++++++++++++++++ torchy_baselines/a2c/a2c.py | 9 +++- torchy_baselines/common/distributions.py | 67 +++++++++++++++++++++++- torchy_baselines/ppo/policies.py | 37 +++++++++---- torchy_baselines/ppo/ppo.py | 12 ++++- 5 files changed, 156 insertions(+), 16 deletions(-) create mode 100644 tests/test_sde.py diff --git a/tests/test_sde.py b/tests/test_sde.py new file mode 100644 index 0000000..7874ae9 --- /dev/null +++ b/tests/test_sde.py @@ -0,0 +1,47 @@ +import pytest + +import torch as th +from torch.distributions import Normal + +from torchy_baselines import A2C + + +def test_state_dependent_exploration(): + state_dim = 3 + # TODO: fix for action_dim > 1 + action_dim = 1 + sigma = th.ones(state_dim, action_dim, requires_grad=True) + + # log_sigma = th.ones(2, 1, requires_grad=True) + + # weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma)) + th.manual_seed(2) + weights_dist = Normal(th.zeros_like(sigma), sigma) + + weights = weights_dist.rsample() + state = th.rand(1, state_dim) + # state = (th.ones(state_dim,) * 2).view(1, -1) + mu = th.ones(action_dim) + # print(weights.shape, state.shape) + noise = th.mm(state, weights) + # variance = th.mm(state ** 2, th.exp(log_sigma) ** 2) + variance = th.mm(state ** 2, sigma ** 2) + action_dist = Normal(mu, th.sqrt(variance)) + + loss = action_dist.log_prob((mu + noise).detach()).mean() + loss.backward() + + # From Rueckstiess paper + grad = th.zeros_like(sigma) + for j in range(action_dim): + for i in range(state_dim): + grad[i, j] = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j]) + + # sigma.grad should be equal to grad + assert sigma.grad.allclose(grad) + + +@pytest.mark.parametrize("model_class", [A2C]) +def test_state_dependent_noise(model_class): + model = 
model_class('MlpPolicy', 'Pendulum-v0', n_steps=200, use_sde=True, verbose=1, create_eval_env=True) + model.learn(total_timesteps=int(1e6), log_interval=10, eval_freq=10000) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 6ee6f4a..3aa1589 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from torchy_baselines.common.utils import explained_variance from torchy_baselines.ppo.ppo import PPO from torchy_baselines.ppo.policies import PPOPolicy +from torchy_baselines.common import logger class A2C(PPO): @@ -30,6 +31,8 @@ class A2C(PPO): :param rms_prop_eps: (float) RMSProp epsilon. It stabilizes square root computation in denominator of RMSProp update :param use_rms_prop: (bool) Whether to use RMSprop (default) or Adam as optimizer + :param use_sde: (bool) Whether to use State Dependent Exploration (SDE) + instead of action noise exploration (default: False) :param normalize_advantage: (bool) Whether to normalize or not the advantage :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param create_eval_env: (bool) Whether to create a second environment that will be @@ -45,7 +48,7 @@ class A2C(PPO): def __init__(self, policy, env, learning_rate=7e-4, n_steps=5, gamma=0.99, gae_lambda=1.0, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, - rms_prop_eps=1e-5, use_rms_prop=True, + rms_prop_eps=1e-5, use_rms_prop=True, use_sde=False, normalize_advantage=False, tensorboard_log=None, create_eval_env=False, policy_kwargs=None, verbose=0, seed=0, device='auto', _init_setup_model=True): @@ -53,7 +56,7 @@ class A2C(PPO): super(A2C, self).__init__(policy, env, learning_rate=learning_rate, n_steps=n_steps, batch_size=None, n_epochs=1, gamma=gamma, gae_lambda=gae_lambda, ent_coef=ent_coef, - vf_coef=vf_coef, max_grad_norm=max_grad_norm, + vf_coef=vf_coef, max_grad_norm=max_grad_norm, use_sde=use_sde, tensorboard_log=tensorboard_log, 
policy_kwargs=policy_kwargs, verbose=verbose, device=device, create_eval_env=create_eval_env, seed=seed, _init_setup_model=False) @@ -73,6 +76,8 @@ class A2C(PPO): eps=self.rms_prop_eps, weight_decay=0) def train(self, gradient_steps, batch_size=None): + if self.use_sde: + logger.logkv("noise net std", th.exp(self.policy.log_std).mean().item()) # Update optimizer learning rate self._update_learning_rate(self.policy.optimizer) diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index a384ce3..d420e03 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -165,15 +165,80 @@ class CategoricalDistribution(Distribution): return log_prob -def make_proba_distribution(action_space): +class StateDependentNoiseDistribution(Distribution): + def __init__(self, features_dim, action_dim): + super(StateDependentNoiseDistribution, self).__init__() + self.distribution = None + self.action_dim = action_dim + self.features_dim = features_dim + self.mean_actions = None + self.log_std = None + self.weights_dist = None + self.noise_weights = None + + @staticmethod + def get_std(log_std): + # TODO: use expln instead of exp only to avoid sigma growing too fast + return th.exp(log_std) + + def sample_weights(self, log_std): + self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std)) + self.noise_weights = self.weights_dist.rsample() + + def proba_distribution_net(self, latent_dim, log_std_init=0.0): + mean_actions = nn.Linear(latent_dim, self.action_dim) + log_std = nn.Parameter(th.zeros(self.features_dim, self.action_dim)) + self.sample_weights(log_std) + return mean_actions, log_std + + def proba_distribution(self, mean_actions, log_std, observations, deterministic=False): + variance = th.mm(observations ** 2, self.get_std(log_std) ** 2) + self.distribution = Normal(mean_actions, th.sqrt(variance)) + + if deterministic: + action = self.mode() + else: + action = 
self.sample(observations) + return action, self + + def mode(self): + return self.distribution.mean + + def sample(self, observations): + noise = th.mm(observations, self.noise_weights) + return self.distribution.mean + noise + + def entropy(self): + return self.distribution.entropy() + + def log_prob_from_params(self, mean_actions, log_std, observations): + action, _ = self.proba_distribution(mean_actions, log_std, observations) + log_prob = self.log_prob(action) + return action, log_prob + + def log_prob(self, action): + log_prob = self.distribution.log_prob(action) + if len(log_prob.shape) > 1: + log_prob = log_prob.sum(axis=1) + else: + log_prob = log_prob.sum() + return log_prob + + +def make_proba_distribution(action_space, features_dim=None, use_sde=False): """ Return an instance of Distribution for the correct type of action space :param action_space: (Gym Space) the input action space + :param feature_dim: (int) Dimension of the feature vector + :param use_sde: (bool) Force the use of StateDependentNoiseDistribution + instead of DiagGaussianDistribution :return: (Distribution) the approriate Distribution object """ if isinstance(action_space, spaces.Box): assert len(action_space.shape) == 1, "Error: the action space must be a vector" + if use_sde: + return StateDependentNoiseDistribution(features_dim, action_space.shape[0]) return DiagGaussianDistribution(action_space.shape[0]) elif isinstance(action_space, spaces.Discrete): return CategoricalDistribution(action_space.n) diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index e973858..08dd7dd 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -6,7 +6,8 @@ import torch.nn as nn import numpy as np from torchy_baselines.common.policies import BasePolicy, register_policy, create_mlp -from torchy_baselines.common.distributions import make_proba_distribution, DiagGaussianDistribution, CategoricalDistribution +from 
torchy_baselines.common.distributions import make_proba_distribution,\ + DiagGaussianDistribution, CategoricalDistribution, StateDependentNoiseDistribution class MlpExtractor(nn.Module): @@ -101,7 +102,8 @@ class MlpExtractor(nn.Module): class PPOPolicy(BasePolicy): def __init__(self, observation_space, action_space, learning_rate, net_arch=None, device='cpu', - activation_fn=nn.Tanh, adam_epsilon=1e-5, ortho_init=True): + activation_fn=nn.Tanh, adam_epsilon=1e-5, + ortho_init=True, use_sde=False): super(PPOPolicy, self).__init__(observation_space, action_space, device) self.obs_dim = self.observation_space.shape[0] if net_arch is None: @@ -118,20 +120,31 @@ class PPOPolicy(BasePolicy): } self.shared_net = None self.pi_net, self.vf_net = None, None - # Action distribution - self.action_dist = make_proba_distribution(action_space) # In the future, feature_extractor will be replaced with a CNN self.features_extractor = nn.Flatten() self.features_dim = self.obs_dim + # Action distribution + self.action_dist = make_proba_distribution(action_space, self.features_dim, use_sde=use_sde) + self._build(learning_rate) + def reset_noise_net(self): + self.action_dist.sample_weights(self.log_std) + # weights_dist = Normal(th.zeros_like(self.noise_log_sigma), th.exp(self.noise_log_sigma)) + # self.noise_net = weights_dist.rsample() + # noise = th.mm(state, weights) + # variance = th.mm(state ** 2, sigma ** 2) + # action_dist = Normal(mu, th.sqrt(variance)) + # # action_dist.log_prob((mu + noise).detach()) + # action_dist.log_prob(action) + # # action_dist = Normal(mu_j + noise_j, sum of s_i * sigma_ij) + # # log_prob = distribution.log_prob(self.noise_net) + def _build(self, learning_rate): self.mlp_extractor = MlpExtractor(self.features_dim, net_arch=self.net_arch, activation_fn=self.activation_fn, device=self.device) - # self.action_net = nn.Linear(self.net_arch[-1], self.action_dim) - # self.log_std = nn.Parameter(th.zeros(self.action_dim)) - if isinstance(self.action_dist, 
DiagGaussianDistribution): + if isinstance(self.action_dist, (DiagGaussianDistribution, StateDependentNoiseDistribution)): self.action_net, self.log_std = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi) elif isinstance(self.action_dist, CategoricalDistribution): self.action_net = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi) @@ -155,28 +168,30 @@ class PPOPolicy(BasePolicy): obs = th.FloatTensor(obs).to(self.device) latent_pi, latent_vf = self._get_latent(obs) value = self.value_net(latent_vf) - action, action_distribution = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic) + action, action_distribution = self._get_action_dist_from_latent(latent_pi, obs, deterministic=deterministic) log_prob = action_distribution.log_prob(action) return action, value, log_prob def _get_latent(self, obs): return self.mlp_extractor(self.features_extractor(obs)) - def _get_action_dist_from_latent(self, latent, deterministic=False): + def _get_action_dist_from_latent(self, latent, obs, deterministic=False): mean_actions = self.action_net(latent) if isinstance(self.action_dist, DiagGaussianDistribution): return self.action_dist.proba_distribution(mean_actions, self.log_std, deterministic=deterministic) elif isinstance(self.action_dist, CategoricalDistribution): return self.action_dist.proba_distribution(mean_actions, deterministic=deterministic) + elif isinstance(self.action_dist, StateDependentNoiseDistribution): + return self.action_dist.proba_distribution(mean_actions, self.log_std, obs, deterministic=deterministic) def actor_forward(self, obs, deterministic=False): latent_pi, _ = self._get_latent(obs) - action, _ = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic) + action, _ = self._get_action_dist_from_latent(latent_pi, obs, deterministic=deterministic) return action.detach().cpu().numpy() def get_policy_stats(self, obs, action): latent_pi, latent_vf = 
self._get_latent(obs) - _, action_distribution = self._get_action_dist_from_latent(latent_pi) + _, action_distribution = self._get_action_dist_from_latent(latent_pi, obs) log_prob = action_distribution.log_prob(action) value = self.value_net(latent_vf) return value, log_prob, action_distribution.entropy() diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 13a1634..7fa318d 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -52,6 +52,8 @@ class PPO(BaseRLModel): :param ent_coef: (float) Entropy coefficient for the loss calculation :param vf_coef: (float) Value function coefficient for the loss calculation :param max_grad_norm: (float) The maximum value for the gradient clipping + :param use_sde: (bool) Whether to use State Dependent Exploration (SDE) + instead of action noise exploration (default: False) :param target_kl: (float) Limit the KL divergence between updates, because the clipping is not enough to prevent large update see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213) @@ -70,7 +72,7 @@ class PPO(BaseRLModel): def __init__(self, policy, env, learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2, clip_range_vf=None, - ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, + ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, use_sde=False, target_kl=None, tensorboard_log=None, create_eval_env=False, policy_kwargs=None, verbose=0, seed=0, device='auto', _init_setup_model=True): @@ -94,6 +96,7 @@ class PPO(BaseRLModel): self.target_kl = target_kl self.tensorboard_log = tensorboard_log self.tb_writer = None + self.use_sde = use_sde if _init_setup_model: self._setup_model() @@ -116,7 +119,8 @@ class PPO(BaseRLModel): self.rollout_buffer = RolloutBuffer(self.n_steps, state_dim, action_dim, self.device, gamma=self.gamma, gae_lambda=self.gae_lambda, n_envs=self.n_envs) self.policy = self.policy(self.observation_space, self.action_space, - 
self.learning_rate, device=self.device, **self.policy_kwargs) + self.learning_rate, use_sde=self.use_sde, device=self.device, + **self.policy_kwargs) self.policy = self.policy.to(self.device) self.clip_range = get_schedule_fn(self.clip_range) @@ -150,6 +154,10 @@ class PPO(BaseRLModel): n_steps = 0 rollout_buffer.reset() + # Sample new weights for the state dependent exploration + # TODO: ensure episodic setting? + if self.use_sde: + self.policy.reset_noise_net() while n_steps < n_rollout_steps: with th.no_grad(): From 69a348276edfda1eabbde6e9b8ec29ea1a2f71bf Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 29 Oct 2019 12:36:40 +0100 Subject: [PATCH 02/19] Add classic advantage computation --- torchy_baselines/common/buffers.py | 48 +++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/torchy_baselines/common/buffers.py b/torchy_baselines/common/buffers.py index 34a7098..7ca61b8 100644 --- a/torchy_baselines/common/buffers.py +++ b/torchy_baselines/common/buffers.py @@ -113,22 +113,42 @@ class RolloutBuffer(BaseBuffer): self.generator_ready = False super(RolloutBuffer, self).reset() - def compute_returns_and_advantage(self, last_value, dones=False): + def compute_returns_and_advantage(self, last_value, dones=False, use_gae=True): """ - From PPO2 + From Stable-Baselines PPO2 + :param last_value: (th.Tensor) + :param dones: ([bool]) + :param use_gae: (bool) Whether to use Generalized Advantage Estimation + or normal advantage for advantage computation. 
""" - last_gae_lam = 0 - for step in reversed(range(self.buffer_size)): - if step == self.buffer_size - 1: - next_non_terminal = th.FloatTensor(1.0 - dones) - next_value = last_value.clone().cpu().flatten() - else: - next_non_terminal = 1.0 - self.dones[step + 1] - next_value = self.values[step + 1] - delta = self.rewards[step] + self.gamma * next_value * next_non_terminal - self.values[step] - last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam - self.advantages[step] = last_gae_lam - self.returns = self.advantages + self.values + if use_gae: + last_gae_lam = 0 + for step in reversed(range(self.buffer_size)): + if step == self.buffer_size - 1: + next_non_terminal = th.FloatTensor(1.0 - dones) + next_value = last_value.clone().cpu().flatten() + else: + next_non_terminal = 1.0 - self.dones[step + 1] + next_value = self.values[step + 1] + delta = self.rewards[step] + self.gamma * next_value * next_non_terminal - self.values[step] + last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam + self.advantages[step] = last_gae_lam + self.returns = self.advantages + self.values + else: + # Discounted return with value bootstrap + # Note: this is equivalent to GAE computation + # with gae_lambda = 1.0 + last_return = 0.0 + for step in reversed(range(self.buffer_size)): + if step == self.buffer_size - 1: + next_non_terminal = th.FloatTensor(1.0 - dones) + next_value = last_value.clone().cpu().flatten() + last_return = self.rewards[step] + next_non_terminal * next_value + else: + next_non_terminal = 1.0 - self.dones[step + 1] + last_return = self.rewards[step] + self.gamma * last_return * next_non_terminal + self.returns[step] = last_return + self.advantages = self.returns - self.values def add(self, obs, action, reward, done, value, log_prob): if len(log_prob.shape) == 0: From 0d41bc13560ac8ed010d8bcc6407d3318a494646 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 29 Oct 2019 15:15:11 +0100 Subject: 
[PATCH 03/19] Add more logging --- torchy_baselines/a2c/a2c.py | 18 ++++++++++++------ torchy_baselines/ppo/ppo.py | 10 ++++++++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 3aa1589..6b51227 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -76,9 +76,6 @@ class A2C(PPO): eps=self.rms_prop_eps, weight_decay=0) def train(self, gradient_steps, batch_size=None): - if self.use_sde: - logger.logkv("noise net std", th.exp(self.policy.log_std).mean().item()) - # Update optimizer learning rate self._update_learning_rate(self.policy.optimizer) # A2C with gradient_steps > 1 does not make sense @@ -118,10 +115,19 @@ class A2C(PPO): # Clip grad norm th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) self.policy.optimizer.step() - # approx_kl_divs.append(th.mean(old_log_prob - log_prob).detach().cpu().numpy()) - # print(explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(), - # self.rollout_buffer.values.flatten().cpu().numpy())) + explained_var = explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(), + self.rollout_buffer.values.flatten().cpu().numpy()) + + logger.logkv("explained_variance", explained_var) + logger.logkv("entropy", entropy.mean().item()) + logger.logkv("policy_loss", policy_loss.item()) + logger.logkv("value_loss", value_loss.item()) + + if self.use_sde: + logger.logkv("noise net std", th.exp(self.policy.log_std).mean().item()) + # print(th.exp(self.policy.log_std).detach()) + def learn(self, total_timesteps, callback=None, log_interval=100, eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="A2C", reset_num_timesteps=True): diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 7fa318d..8171eb5 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -245,8 +245,14 @@ class PPO(BaseRLModel): print("Early stopping at step {} due to reaching max 
kl: {:.2f}".format(it, np.mean(approx_kl_divs))) break - # print(explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(), - # self.rollout_buffer.values.flatten().cpu().numpy())) + explained_var = explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(), + self.rollout_buffer.values.flatten().cpu().numpy()) + + logger.logkv("explained_variance", explained_var) + # TODO: gather stats for the entropy and other losses? + logger.logkv("entropy", entropy.mean().item()) + logger.logkv("policy_loss", policy_loss.item()) + logger.logkv("value_loss", value_loss.item()) def learn(self, total_timesteps, callback=None, log_interval=1, eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", reset_num_timesteps=True): From 42d50ed09b29c1ca2b6870deac42a22f7ba95ee2 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 29 Oct 2019 15:15:54 +0100 Subject: [PATCH 04/19] Add expln --- tests/test_sde.py | 3 ++- torchy_baselines/common/distributions.py | 23 ++++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/test_sde.py b/tests/test_sde.py index 7874ae9..3c5db43 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -43,5 +43,6 @@ def test_state_dependent_exploration(): @pytest.mark.parametrize("model_class", [A2C]) def test_state_dependent_noise(model_class): - model = model_class('MlpPolicy', 'Pendulum-v0', n_steps=200, use_sde=True, verbose=1, create_eval_env=True) + model = model_class('MlpPolicy', 'Pendulum-v0', n_steps=200, + use_sde=True, ent_coef=0.0, verbose=1, create_eval_env=True) model.learn(total_timesteps=int(1e6), log_interval=10, eval_freq=10000) diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index d420e03..2944d8c 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -166,7 +166,7 @@ class CategoricalDistribution(Distribution): class StateDependentNoiseDistribution(Distribution): - def 
__init__(self, features_dim, action_dim): + def __init__(self, features_dim, action_dim, use_expln=False): super(StateDependentNoiseDistribution, self).__init__() self.distribution = None self.action_dim = action_dim @@ -175,19 +175,28 @@ class StateDependentNoiseDistribution(Distribution): self.log_std = None self.weights_dist = None self.noise_weights = None + self.use_expln = use_expln - @staticmethod - def get_std(log_std): - # TODO: use expln instead of exp only to avoid sigma growing too fast - return th.exp(log_std) + def get_std(self, log_std): + if self.use_expln: + # From SDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + if log_std <= 0: + return th.exp(log_std) + else: + return th.log(log_std + 1.0) + 1.0 + else: + return th.exp(log_std) def sample_weights(self, log_std): self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std)) self.noise_weights = self.weights_dist.rsample() - def proba_distribution_net(self, latent_dim, log_std_init=0.0): + def proba_distribution_net(self, latent_dim, log_std_init=-3): + print("Log std init:", log_std_init) mean_actions = nn.Linear(latent_dim, self.action_dim) - log_std = nn.Parameter(th.zeros(self.features_dim, self.action_dim)) + # TODO: log_std_init depending on the number of layers? 
+ log_std = nn.Parameter(th.ones(self.features_dim, self.action_dim) * log_std_init) self.sample_weights(log_std) return mean_actions, log_std From c0cb9fc9c57e0a3fd14c5675c2bc206d64f562f4 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 29 Oct 2019 18:30:36 +0100 Subject: [PATCH 05/19] Fix predict method --- torchy_baselines/ppo/ppo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 8171eb5..8127307 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -127,14 +127,14 @@ class PPO(BaseRLModel): if self.clip_range_vf is not None: self.clip_range_vf = get_schedule_fn(self.clip_range_vf) - def select_action(self, observation): + def select_action(self, observation, deterministic=False): # Normally not needed observation = np.array(observation) with th.no_grad(): observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device) - return self.policy.actor_forward(observation, deterministic=False) + return self.policy.actor_forward(observation, deterministic=deterministic) - def predict(self, observation, state=None, mask=None, deterministic=True): + def predict(self, observation, state=None, mask=None, deterministic=False): """ Get the model's action from an observation @@ -144,7 +144,7 @@ class PPO(BaseRLModel): :param deterministic: (bool) Whether or not to return deterministic actions. 
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) """ - clipped_actions = self.select_action(observation) + clipped_actions = self.select_action(observation, deterministic=deterministic) if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(clipped_actions, self.action_space.low, self.action_space.high) return clipped_actions From 9e8f6e00201251d3598e3c3ea279ed100456eac0 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 29 Oct 2019 18:42:34 +0100 Subject: [PATCH 06/19] Add default filename for monitor --- torchy_baselines/common/monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchy_baselines/common/monitor.py b/torchy_baselines/common/monitor.py index c241de9..53bea70 100644 --- a/torchy_baselines/common/monitor.py +++ b/torchy_baselines/common/monitor.py @@ -13,7 +13,7 @@ class Monitor(Wrapper): EXT = "monitor.csv" file_handler = None - def __init__(self, env, filename, allow_early_resets=True, reset_keywords=(), info_keywords=()): + def __init__(self, env, filename=None, allow_early_resets=True, reset_keywords=(), info_keywords=()): """ A monitor wrapper for Gym environments, it is used to know the episode reward, length, time and other data. 
From 0174ec269e341acc15675f66ebb559100ee59662 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 29 Oct 2019 18:43:16 +0100 Subject: [PATCH 07/19] Clean up --- torchy_baselines/a2c/a2c.py | 4 +++- torchy_baselines/common/distributions.py | 5 ++--- torchy_baselines/ppo/policies.py | 18 +++++++----------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 6b51227..51eb846 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -112,6 +112,7 @@ class A2C(PPO): # Optimization step self.policy.optimizer.zero_grad() loss.backward() + # Clip grad norm th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) self.policy.optimizer.step() @@ -123,9 +124,10 @@ class A2C(PPO): logger.logkv("entropy", entropy.mean().item()) logger.logkv("policy_loss", policy_loss.item()) logger.logkv("value_loss", value_loss.item()) + logger.logkv("std", th.exp(self.policy.log_std).mean().item()) if self.use_sde: - logger.logkv("noise net std", th.exp(self.policy.log_std).mean().item()) + pass # print(th.exp(self.policy.log_std).detach()) diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index 2944d8c..5c9cdac 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -180,7 +180,7 @@ class StateDependentNoiseDistribution(Distribution): def get_std(self, log_std): if self.use_expln: # From SDE paper, it allows to keep variance - # above zero and prevent it from growing too fast + # above zero and prevent it from growing too fast if log_std <= 0: return th.exp(log_std) else: @@ -192,8 +192,7 @@ class StateDependentNoiseDistribution(Distribution): self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std)) self.noise_weights = self.weights_dist.rsample() - def proba_distribution_net(self, latent_dim, log_std_init=-3): - print("Log std init:", log_std_init) + def 
proba_distribution_net(self, latent_dim, log_std_init=-1): mean_actions = nn.Linear(latent_dim, self.action_dim) # TODO: log_std_init depending on the number of layers? log_std = nn.Parameter(th.ones(self.features_dim, self.action_dim) * log_std_init) diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index 08dd7dd..25d0bf3 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -103,7 +103,7 @@ class PPOPolicy(BasePolicy): def __init__(self, observation_space, action_space, learning_rate, net_arch=None, device='cpu', activation_fn=nn.Tanh, adam_epsilon=1e-5, - ortho_init=True, use_sde=False): + ortho_init=True, use_sde=False, log_std_init=0.0): super(PPOPolicy, self).__init__(observation_space, action_space, device) self.obs_dim = self.observation_space.shape[0] if net_arch is None: @@ -123,6 +123,7 @@ class PPOPolicy(BasePolicy): # In the future, feature_extractor will be replaced with a CNN self.features_extractor = nn.Flatten() self.features_dim = self.obs_dim + self.log_std_init = log_std_init # Action distribution self.action_dist = make_proba_distribution(action_space, self.features_dim, use_sde=use_sde) @@ -130,22 +131,14 @@ class PPOPolicy(BasePolicy): def reset_noise_net(self): self.action_dist.sample_weights(self.log_std) - # weights_dist = Normal(th.zeros_like(self.noise_log_sigma), th.exp(self.noise_log_sigma)) - # self.noise_net = weights_dist.rsample() - # noise = th.mm(state, weights) - # variance = th.mm(state ** 2, sigma ** 2) - # action_dist = Normal(mu, th.sqrt(variance)) - # # action_dist.log_prob((mu + noise).detach()) - # action_dist.log_prob(action) - # # action_dist = Normal(mu_j + noise_j, sum of s_i * sigma_ij) - # # log_prob = distribution.log_prob(self.noise_net) def _build(self, learning_rate): self.mlp_extractor = MlpExtractor(self.features_dim, net_arch=self.net_arch, activation_fn=self.activation_fn, device=self.device) if isinstance(self.action_dist, 
(DiagGaussianDistribution, StateDependentNoiseDistribution)): - self.action_net, self.log_std = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi) + self.action_net, self.log_std = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi, + log_std_init=self.log_std_init) elif isinstance(self.action_dist, CategoricalDistribution): self.action_net = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi) @@ -177,10 +170,13 @@ class PPOPolicy(BasePolicy): def _get_action_dist_from_latent(self, latent, obs, deterministic=False): mean_actions = self.action_net(latent) + if isinstance(self.action_dist, DiagGaussianDistribution): return self.action_dist.proba_distribution(mean_actions, self.log_std, deterministic=deterministic) + elif isinstance(self.action_dist, CategoricalDistribution): return self.action_dist.proba_distribution(mean_actions, deterministic=deterministic) + elif isinstance(self.action_dist, StateDependentNoiseDistribution): return self.action_dist.proba_distribution(mean_actions, self.log_std, obs, deterministic=deterministic) From 862ae666b531463158e901d163b7933e9622b10d Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 30 Oct 2019 15:30:09 +0100 Subject: [PATCH 08/19] Try squashing the sde --- torchy_baselines/common/distributions.py | 32 ++++++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index 5c9cdac..a62ede8 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -119,7 +119,9 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution): # Naive implementation (not stable): 0.5 * torch.log((1 + x ) / (1 - x)) # We use numpy to avoid numerical instability if gaussian_action is None: - gaussian_action = th.from_numpy(np.arctanh(action.cpu().numpy())).to(action.device) + # Clip to 
avoid NaN + clipped_action = np.clip(action.cpu().numpy(), -1.0 + self.epsilon, 1.0 + self.epsilon) + gaussian_action = th.from_numpy(np.arctanh(clipped_action)).to(action.device) # Log likelihood for a gaussian distribution log_prob = super(SquashedDiagGaussianDistribution, self).log_prob(gaussian_action) @@ -166,7 +168,8 @@ class CategoricalDistribution(Distribution): class StateDependentNoiseDistribution(Distribution): - def __init__(self, features_dim, action_dim, use_expln=False): + def __init__(self, features_dim, action_dim, use_expln=False, + squash_output=True, epsilon=1e-6): super(StateDependentNoiseDistribution, self).__init__() self.distribution = None self.action_dim = action_dim @@ -175,7 +178,10 @@ class StateDependentNoiseDistribution(Distribution): self.log_std = None self.weights_dist = None self.noise_weights = None + self.gaussian_action = None self.use_expln = use_expln + self.squash_output = squash_output + self.epsilon = epsilon def get_std(self, log_std): if self.use_expln: @@ -210,13 +216,20 @@ class StateDependentNoiseDistribution(Distribution): return action, self def mode(self): - return self.distribution.mean + self.gaussian_action = self.distribution.mean + if self.squash_output: + return th.tanh(self.gaussian_action) + return self.gaussian_action def sample(self, observations): noise = th.mm(observations, self.noise_weights) - return self.distribution.mean + noise + self.gaussian_action = self.distribution.mean + noise + if self.squash_output: + return th.tanh(self.gaussian_action) + return self.gaussian_action def entropy(self): + # TODO: account for the squashing? 
return self.distribution.entropy() def log_prob_from_params(self, mean_actions, log_std, observations): @@ -225,11 +238,20 @@ class StateDependentNoiseDistribution(Distribution): return action, log_prob def log_prob(self, action): - log_prob = self.distribution.log_prob(action) + if self.squash_output: + gaussian_action = self.gaussian_action + else: + gaussian_action = action + # log likelihood for a gaussian + log_prob = self.distribution.log_prob(gaussian_action) + # log_prob = self.distribution.log_prob(action) if len(log_prob.shape) > 1: log_prob = log_prob.sum(axis=1) else: log_prob = log_prob.sum() + if self.squash_output: + # Squash correction (from original SAC implementation) + log_prob -= th.sum(th.log(1 - action ** 2 + self.epsilon), dim=1) return log_prob From 925afe784c595c114d5b3c5f4e81766c8297dca3 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 31 Oct 2019 11:44:27 +0100 Subject: [PATCH 09/19] SDE on latent_pi --- torchy_baselines/a2c/a2c.py | 1 - torchy_baselines/common/distributions.py | 97 ++++++++++++++++-------- torchy_baselines/ppo/policies.py | 16 ++-- 3 files changed, 73 insertions(+), 41 deletions(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 51eb846..d355e09 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -112,7 +112,6 @@ class A2C(PPO): # Optimization step self.policy.optimizer.zero_grad() loss.backward() - # Clip grad norm th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) self.policy.optimizer.step() diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index a62ede8..72bf7b2 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -2,6 +2,7 @@ import numpy as np import torch as th import torch.nn as nn from torch.distributions import Normal, Categorical +import torch.nn.functional as F from gym import spaces class Distribution(object): @@ -168,20 
+169,20 @@ class CategoricalDistribution(Distribution): class StateDependentNoiseDistribution(Distribution): - def __init__(self, features_dim, action_dim, use_expln=False, - squash_output=True, epsilon=1e-6): + def __init__(self, action_dim, use_expln=False, + squash_output=False, epsilon=1e-6): super(StateDependentNoiseDistribution, self).__init__() self.distribution = None self.action_dim = action_dim - self.features_dim = features_dim self.mean_actions = None self.log_std = None self.weights_dist = None - self.noise_weights = None - self.gaussian_action = None + self.exploration_mat = None self.use_expln = use_expln - self.squash_output = squash_output - self.epsilon = epsilon + if squash_output: + self.bijector = TanhBijector(epsilon) + else: + self.bijector = None def get_std(self, log_std): if self.use_expln: @@ -196,71 +197,103 @@ class StateDependentNoiseDistribution(Distribution): def sample_weights(self, log_std): self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std)) - self.noise_weights = self.weights_dist.rsample() + self.exploration_mat = self.weights_dist.rsample() def proba_distribution_net(self, latent_dim, log_std_init=-1): mean_actions = nn.Linear(latent_dim, self.action_dim) # TODO: log_std_init depending on the number of layers? 
- log_std = nn.Parameter(th.ones(self.features_dim, self.action_dim) * log_std_init) + log_std = nn.Parameter(th.ones(latent_dim, self.action_dim) * log_std_init) self.sample_weights(log_std) return mean_actions, log_std - def proba_distribution(self, mean_actions, log_std, observations, deterministic=False): - variance = th.mm(observations ** 2, self.get_std(log_std) ** 2) + def proba_distribution(self, mean_actions, log_std, latent_pi, deterministic=False): + # TODO: try without detach + variance = th.mm(latent_pi.detach() ** 2, self.get_std(log_std) ** 2) self.distribution = Normal(mean_actions, th.sqrt(variance)) if deterministic: action = self.mode() else: - action = self.sample(observations) + action = self.sample(latent_pi) return action, self def mode(self): - self.gaussian_action = self.distribution.mean - if self.squash_output: - return th.tanh(self.gaussian_action) - return self.gaussian_action + action = self.distribution.mean + if self.bijector is not None: + return self.bijector.forward(action) + return action - def sample(self, observations): - noise = th.mm(observations, self.noise_weights) - self.gaussian_action = self.distribution.mean + noise - if self.squash_output: - return th.tanh(self.gaussian_action) - return self.gaussian_action + def sample(self, latent_pi): + noise = th.mm(latent_pi.detach(), self.exploration_mat) + action = self.distribution.mean + noise + if self.bijector is not None: + return self.bijector.forward(action) + return action def entropy(self): # TODO: account for the squashing? 
return self.distribution.entropy() - def log_prob_from_params(self, mean_actions, log_std, observations): - action, _ = self.proba_distribution(mean_actions, log_std, observations) + def log_prob_from_params(self, mean_actions, log_std, latent_pi): + action, _ = self.proba_distribution(mean_actions, log_std, latent_pi) log_prob = self.log_prob(action) return action, log_prob def log_prob(self, action): - if self.squash_output: - gaussian_action = self.gaussian_action + if self.bijector is not None: + gaussian_action = self.bijector.inverse(action) else: gaussian_action = action # log likelihood for a gaussian log_prob = self.distribution.log_prob(gaussian_action) - # log_prob = self.distribution.log_prob(action) + if len(log_prob.shape) > 1: log_prob = log_prob.sum(axis=1) else: log_prob = log_prob.sum() - if self.squash_output: + + if self.bijector is not None: # Squash correction (from original SAC implementation) - log_prob -= th.sum(th.log(1 - action ** 2 + self.epsilon), dim=1) + log_prob -= th.sum(self.bijector.log_prob_correction(gaussian_action), dim=1) return log_prob -def make_proba_distribution(action_space, features_dim=None, use_sde=False): +class TanhBijector(object): + def __init__(self, epsilon=1e-6): + super(TanhBijector, self).__init__() + self.epsilon = epsilon + + def forward(self, x): + return th.tanh(x) + + def inverse(self, action): + """ + Inverse tanh. 
+ + From https://github.com/tensorflow/agents: + 0.99999997 is the maximum value such that atanh(x) is valid for both + float32 and float64 + + :param action: (th.Tensor) + :return: (th.Tensor) + """ + # Inverse tanh + # Naive implementation (not stable): 0.5 * torch.log((1 + x ) / (1 - x)) + # We use numpy to avoid numerical instability + # Note: Using numpy, we do not keep the gradient + clipped_action = np.clip(action.cpu().numpy(), -0.99999997, 0.99999997) + return th.from_numpy(np.arctanh(clipped_action)).to(action.device) + + def log_prob_correction(self, x): + # Squash correction (from original SAC implementation) + return th.log(1 - th.tanh(x) ** 2 + self.epsilon) + + +def make_proba_distribution(action_space, use_sde=False): """ Return an instance of Distribution for the correct type of action space :param action_space: (Gym Space) the input action space - :param feature_dim: (int) Dimension of the feature vector :param use_sde: (bool) Force the use of StateDependentNoiseDistribution instead of DiagGaussianDistribution :return: (Distribution) the approriate Distribution object @@ -268,7 +301,7 @@ def make_proba_distribution(action_space, features_dim=None, use_sde=False): if isinstance(action_space, spaces.Box): assert len(action_space.shape) == 1, "Error: the action space must be a vector" if use_sde: - return StateDependentNoiseDistribution(features_dim, action_space.shape[0]) + return StateDependentNoiseDistribution(action_space.shape[0]) return DiagGaussianDistribution(action_space.shape[0]) elif isinstance(action_space, spaces.Discrete): return CategoricalDistribution(action_space.n) diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index 25d0bf3..1e3b25c 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -125,7 +125,7 @@ class PPOPolicy(BasePolicy): self.features_dim = self.obs_dim self.log_std_init = log_std_init # Action distribution - self.action_dist = 
make_proba_distribution(action_space, self.features_dim, use_sde=use_sde) + self.action_dist = make_proba_distribution(action_space, use_sde=use_sde) self._build(learning_rate) @@ -161,15 +161,15 @@ class PPOPolicy(BasePolicy): obs = th.FloatTensor(obs).to(self.device) latent_pi, latent_vf = self._get_latent(obs) value = self.value_net(latent_vf) - action, action_distribution = self._get_action_dist_from_latent(latent_pi, obs, deterministic=deterministic) + action, action_distribution = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic) log_prob = action_distribution.log_prob(action) return action, value, log_prob def _get_latent(self, obs): return self.mlp_extractor(self.features_extractor(obs)) - def _get_action_dist_from_latent(self, latent, obs, deterministic=False): - mean_actions = self.action_net(latent) + def _get_action_dist_from_latent(self, latent_pi, deterministic=False): + mean_actions = self.action_net(latent_pi) if isinstance(self.action_dist, DiagGaussianDistribution): return self.action_dist.proba_distribution(mean_actions, self.log_std, deterministic=deterministic) @@ -178,16 +178,16 @@ class PPOPolicy(BasePolicy): return self.action_dist.proba_distribution(mean_actions, deterministic=deterministic) elif isinstance(self.action_dist, StateDependentNoiseDistribution): - return self.action_dist.proba_distribution(mean_actions, self.log_std, obs, deterministic=deterministic) + return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_pi, deterministic=deterministic) def actor_forward(self, obs, deterministic=False): latent_pi, _ = self._get_latent(obs) - action, _ = self._get_action_dist_from_latent(latent_pi, obs, deterministic=deterministic) + action, _ = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic) return action.detach().cpu().numpy() - def get_policy_stats(self, obs, action): + def get_policy_stats(self, obs, action, deterministic=False): latent_pi, latent_vf = 
self._get_latent(obs) - _, action_distribution = self._get_action_dist_from_latent(latent_pi, obs) + _, action_distribution = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic) log_prob = action_distribution.log_prob(action) value = self.value_net(latent_vf) return value, log_prob, action_distribution.entropy() From 72a6f18e4309c101da565e0b816cfdf6ea6f55b5 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 31 Oct 2019 14:14:30 +0100 Subject: [PATCH 10/19] Add sde test + fix random seed --- tests/test_sde.py | 27 ++++++++++++++++-------- torchy_baselines/common/base_class.py | 4 +++- torchy_baselines/common/distributions.py | 3 +-- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/tests/test_sde.py b/tests/test_sde.py index 3c5db43..f397558 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -7,24 +7,22 @@ from torchy_baselines import A2C def test_state_dependent_exploration(): + n_states = 2 state_dim = 3 # TODO: fix for action_dim > 1 action_dim = 1 sigma = th.ones(state_dim, action_dim, requires_grad=True) - # log_sigma = th.ones(2, 1, requires_grad=True) - # weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma)) th.manual_seed(2) weights_dist = Normal(th.zeros_like(sigma), sigma) weights = weights_dist.rsample() - state = th.rand(1, state_dim) - # state = (th.ones(state_dim,) * 2).view(1, -1) + state = th.rand(n_states, state_dim) mu = th.ones(action_dim) # print(weights.shape, state.shape) noise = th.mm(state, weights) - # variance = th.mm(state ** 2, th.exp(log_sigma) ** 2) + variance = th.mm(state ** 2, sigma ** 2) action_dist = Normal(mu, th.sqrt(variance)) @@ -35,7 +33,8 @@ def test_state_dependent_exploration(): grad = th.zeros_like(sigma) for j in range(action_dim): for i in range(state_dim): - grad[i, j] = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j]) + a = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * 
sigma[i, j]) + grad[i, j] = a.mean() # sigma.grad should be equal to grad assert sigma.grad.allclose(grad) @@ -43,6 +42,16 @@ def test_state_dependent_exploration(): @pytest.mark.parametrize("model_class", [A2C]) def test_state_dependent_noise(model_class): - model = model_class('MlpPolicy', 'Pendulum-v0', n_steps=200, - use_sde=True, ent_coef=0.0, verbose=1, create_eval_env=True) - model.learn(total_timesteps=int(1e6), log_interval=10, eval_freq=10000) + import gym + from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize + from torchy_baselines.common.monitor import Monitor + + # env_id = 'Pendulum-v0' + env_id = 'MountainCarContinuous-v0' + # env_id = 'LunarLanderContinuous-v2' + env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), norm_reward=True) + eval_env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), training=False, norm_reward=False) + model = model_class('MlpPolicy', env, n_steps=200, max_grad_norm=1, use_rms_prop=False, + use_sde=True, ent_coef=0.00, verbose=1, create_eval_env=True, learning_rate=3e-4, + policy_kwargs=dict(log_std_init=0.0, ortho_init=False, net_arch=[256, dict(pi=[256], vf=[256])]), seed=None) + model.learn(total_timesteps=int(20000), log_interval=5, eval_freq=10000, eval_env=eval_env) diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 6adc45c..a6b9a41 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -282,7 +282,9 @@ class BaseRLModel(object): """ raise NotImplementedError() - def set_random_seed(self, seed=0): + def set_random_seed(self, seed=None): + if seed is None: + return set_random_seed(seed, using_cuda=self.device == th.device('cuda')) self.action_space.seed(seed) if self.env is not None: diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index 72bf7b2..90105ee 100644 --- a/torchy_baselines/common/distributions.py +++ 
b/torchy_baselines/common/distributions.py @@ -199,9 +199,8 @@ class StateDependentNoiseDistribution(Distribution): self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std)) self.exploration_mat = self.weights_dist.rsample() - def proba_distribution_net(self, latent_dim, log_std_init=-1): + def proba_distribution_net(self, latent_dim, log_std_init=0.0): mean_actions = nn.Linear(latent_dim, self.action_dim) - # TODO: log_std_init depending on the number of layers? log_std = nn.Parameter(th.ones(latent_dim, self.action_dim) * log_std_init) self.sample_weights(log_std) return mean_actions, log_std From 9644ae89cfa77f920a4f7c779595b3ebbb374c2a Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 31 Oct 2019 16:17:08 +0100 Subject: [PATCH 11/19] Log ppo std --- torchy_baselines/ppo/ppo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 8127307..bbf880b 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -253,6 +253,7 @@ class PPO(BaseRLModel): logger.logkv("entropy", entropy.mean().item()) logger.logkv("policy_loss", policy_loss.item()) logger.logkv("value_loss", value_loss.item()) + logger.logkv("std", th.exp(self.policy.log_std).mean().item()) def learn(self, total_timesteps, callback=None, log_interval=1, eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", reset_num_timesteps=True): From 0e092f7c528d86bdfa348f70899790edb65770d5 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 31 Oct 2019 16:59:35 +0100 Subject: [PATCH 12/19] Add plotting script --- torchy_baselines/a2c/a2c.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index d355e09..fed4290 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -64,6 +64,8 @@ class A2C(PPO): self.normalize_advantage = normalize_advantage self.rms_prop_eps = 
rms_prop_eps self.use_rms_prop = use_rms_prop + self.actions = [] + self.states = [] if _init_setup_model: self._setup_model() @@ -125,12 +127,39 @@ class A2C(PPO): logger.logkv("value_loss", value_loss.item()) logger.logkv("std", th.exp(self.policy.log_std).mean().item()) - if self.use_sde: - pass - # print(th.exp(self.policy.log_std).detach()) + self.states.append(self.rollout_buffer.observations.cpu().numpy()) + self.actions.append(self.rollout_buffer.actions.cpu().numpy()) + + # Plot for MountainCarContinuous-v0 + if True: + if len(self.actions) > 10: + import matplotlib.pyplot as plt + import numpy as np + actions = np.concatenate(self.actions) + x = np.arange(len(actions)) + plt.figure("actions") + start = 0 + for i in range(len(self.actions)): + end = start + len(self.actions[i]) + # plt.plot(x[start:end], self.actions[i]) + # Clipped actions: real behavior, note that it is between [-2, 2] for the Pendulum + plt.scatter(x[start:end], np.clip(self.actions[i], -1, 1), s=1) + # plt.scatter(x[start:end], self.actions[i], s=1) + start = end + + plt.figure("states") + for i in range(len(self.states)): + if len(self.states[i].shape) > 1: + # plt.plot(self.states[i][:, 0], self.states[i][:, 1]) + plt.scatter(self.states[i][:, 0], self.states[i][:, 1], s=1) + else: + plt.scatter(x[start:end], self.states[i], s=1) + + plt.show() + import ipdb; ipdb.set_trace() - def learn(self, total_timesteps, callback=None, log_interval=100, + def learn(self, total_timesteps, callback=None, log_interval=5, eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="A2C", reset_num_timesteps=True): return super(A2C, self).learn(total_timesteps=total_timesteps, callback=callback, log_interval=log_interval, From 9acff0f5b37ea57932279b774c17ad1030f9e74f Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 31 Oct 2019 17:01:27 +0100 Subject: [PATCH 13/19] Remove plotting script --- torchy_baselines/a2c/a2c.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) 
diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index fed4290..8807c91 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -64,8 +64,6 @@ class A2C(PPO): self.normalize_advantage = normalize_advantage self.rms_prop_eps = rms_prop_eps self.use_rms_prop = use_rms_prop - self.actions = [] - self.states = [] if _init_setup_model: self._setup_model() @@ -127,38 +125,6 @@ class A2C(PPO): logger.logkv("value_loss", value_loss.item()) logger.logkv("std", th.exp(self.policy.log_std).mean().item()) - self.states.append(self.rollout_buffer.observations.cpu().numpy()) - self.actions.append(self.rollout_buffer.actions.cpu().numpy()) - - # Plot for MountainCarContinuous-v0 - if True: - if len(self.actions) > 10: - import matplotlib.pyplot as plt - import numpy as np - actions = np.concatenate(self.actions) - x = np.arange(len(actions)) - plt.figure("actions") - start = 0 - for i in range(len(self.actions)): - end = start + len(self.actions[i]) - # plt.plot(x[start:end], self.actions[i]) - # Clipped actions: real behavior, note that it is between [-2, 2] for the Pendulum - plt.scatter(x[start:end], np.clip(self.actions[i], -1, 1), s=1) - # plt.scatter(x[start:end], self.actions[i], s=1) - start = end - - plt.figure("states") - for i in range(len(self.states)): - if len(self.states[i].shape) > 1: - # plt.plot(self.states[i][:, 0], self.states[i][:, 1]) - plt.scatter(self.states[i][:, 0], self.states[i][:, 1], s=1) - else: - plt.scatter(x[start:end], self.states[i], s=1) - - plt.show() - import ipdb; ipdb.set_trace() - - def learn(self, total_timesteps, callback=None, log_interval=5, eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="A2C", reset_num_timesteps=True): From 6c7c8375a47cd4ecaa53dcf8a3571cff57f81235 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 7 Nov 2019 11:16:59 +0100 Subject: [PATCH 14/19] Update log interval --- torchy_baselines/a2c/a2c.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 8807c91..9e6af14 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -125,7 +125,7 @@ class A2C(PPO): logger.logkv("value_loss", value_loss.item()) logger.logkv("std", th.exp(self.policy.log_std).mean().item()) - def learn(self, total_timesteps, callback=None, log_interval=5, + def learn(self, total_timesteps, callback=None, log_interval=100, eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="A2C", reset_num_timesteps=True): return super(A2C, self).learn(total_timesteps=total_timesteps, callback=callback, log_interval=log_interval, From c6f90b9c3c62f3f4e6869f49982d0989f15e6aa4 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 7 Nov 2019 11:17:26 +0100 Subject: [PATCH 15/19] Improve VecNormalize syncing for evaluation --- torchy_baselines/common/distributions.py | 2 +- torchy_baselines/ppo/ppo.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index 90105ee..37f4cf8 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -180,6 +180,7 @@ class StateDependentNoiseDistribution(Distribution): self.exploration_mat = None self.use_expln = use_expln if squash_output: + print("== Using TanhBijector ===") self.bijector = TanhBijector(epsilon) else: self.bijector = None @@ -206,7 +207,6 @@ class StateDependentNoiseDistribution(Distribution): return mean_actions, log_std def proba_distribution(self, mean_actions, log_std, latent_pi, deterministic=False): - # TODO: try without detach variance = th.mm(latent_pi.detach() ** 2, self.get_std(log_std) ** 2) self.distribution = Normal(mean_actions, th.sqrt(variance)) diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index bbf880b..32e5f95 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -17,7 +17,7 @@ from 
torchy_baselines.common.base_class import BaseRLModel from torchy_baselines.common.evaluation import evaluate_policy from torchy_baselines.common.buffers import RolloutBuffer from torchy_baselines.common.utils import explained_variance, get_schedule_fn -from torchy_baselines.common.vec_env import VecNormalize +from torchy_baselines.common.vec_env import VecNormalize, VecEnvWrapper from torchy_baselines.common import logger from torchy_baselines.ppo.policies import PPOPolicy @@ -294,9 +294,14 @@ class PPO(BaseRLModel): # Evaluate agent if 0 < eval_freq <= timesteps_since_eval and eval_env is not None: timesteps_since_eval %= eval_freq + # TODO: move that to the base class # Sync eval env and train env when using VecNormalize - if isinstance(self.env, VecNormalize): - eval_env.obs_rms = deepcopy(self.env.obs_rms) + env_tmp, eval_env_tmp = self.env, eval_env + while isinstance(env_tmp, VecEnvWrapper): + if isinstance(env_tmp, VecNormalize): + eval_env_tmp.obs_rms = deepcopy(env_tmp.obs_rms) + env_tmp = env_tmp.venv + eval_env_tmp.venv mean_reward, _ = evaluate_policy(self, eval_env, n_eval_episodes) if self.tb_writer is not None: self.tb_writer.add_scalar('Eval/reward', mean_reward, self.num_timesteps) From 95c741c7073ce2a318af769e765c2f6cf7be4f3b Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 7 Nov 2019 17:01:02 +0100 Subject: [PATCH 16/19] Fix logger for discrete actions --- torchy_baselines/a2c/a2c.py | 3 ++- torchy_baselines/ppo/ppo.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 9e6af14..41ca60c 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -123,7 +123,8 @@ class A2C(PPO): logger.logkv("entropy", entropy.mean().item()) logger.logkv("policy_loss", policy_loss.item()) logger.logkv("value_loss", value_loss.item()) - logger.logkv("std", th.exp(self.policy.log_std).mean().item()) + if hasattr(self.policy, 'log_std'): + logger.logkv("std", 
th.exp(self.policy.log_std).mean().item()) def learn(self, total_timesteps, callback=None, log_interval=100, eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="A2C", reset_num_timesteps=True): diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 32e5f95..2b2a5e6 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -253,7 +253,8 @@ class PPO(BaseRLModel): logger.logkv("entropy", entropy.mean().item()) logger.logkv("policy_loss", policy_loss.item()) logger.logkv("value_loss", value_loss.item()) - logger.logkv("std", th.exp(self.policy.log_std).mean().item()) + if hasattr(self.policy, 'log_std'): + logger.logkv("std", th.exp(self.policy.log_std).mean().item()) def learn(self, total_timesteps, callback=None, log_interval=1, eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", reset_num_timesteps=True): From 5d353d598ca53c0a20a859f66f4f2a05ea41627c Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 18 Nov 2019 14:09:31 +0100 Subject: [PATCH 17/19] Start cleanup + update docstrings --- tests/test_sde.py | 25 +++++++++++--------- torchy_baselines/common/distributions.py | 29 ++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/tests/test_sde.py b/tests/test_sde.py index f397558..03c8f62 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -1,17 +1,25 @@ import pytest +import gym import torch as th from torch.distributions import Normal from torchy_baselines import A2C +from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize +from torchy_baselines.common.monitor import Monitor def test_state_dependent_exploration(): + """ + Check that the gradient correspond to the expected one + """ n_states = 2 state_dim = 3 # TODO: fix for action_dim > 1 action_dim = 1 - sigma = th.ones(state_dim, action_dim, requires_grad=True) + sigma = th.ones(state_dim, 1, requires_grad=True) + # Reduce the number of parameters + # sigma_ = th.ones(state_dim, 
action_dim) * sigma_ # weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma)) th.manual_seed(2) @@ -42,16 +50,11 @@ def test_state_dependent_exploration(): @pytest.mark.parametrize("model_class", [A2C]) def test_state_dependent_noise(model_class): - import gym - from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize - from torchy_baselines.common.monitor import Monitor - - # env_id = 'Pendulum-v0' env_id = 'MountainCarContinuous-v0' - # env_id = 'LunarLanderContinuous-v2' + env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), norm_reward=True) eval_env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), training=False, norm_reward=False) - model = model_class('MlpPolicy', env, n_steps=200, max_grad_norm=1, use_rms_prop=False, - use_sde=True, ent_coef=0.00, verbose=1, create_eval_env=True, learning_rate=3e-4, - policy_kwargs=dict(log_std_init=0.0, ortho_init=False, net_arch=[256, dict(pi=[256], vf=[256])]), seed=None) - model.learn(total_timesteps=int(20000), log_interval=5, eval_freq=10000, eval_env=eval_env) + + model = model_class('MlpPolicy', env, n_steps=200, use_sde=True, ent_coef=0.00, verbose=1, learning_rate=3e-4, + policy_kwargs=dict(log_std_init=0.0, ortho_init=False), seed=None) + model.learn(total_timesteps=int(1000), log_interval=5, eval_freq=500, eval_env=eval_env) diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index 37f4cf8..82a07a2 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -45,6 +45,11 @@ class Distribution(object): class DiagGaussianDistribution(Distribution): + """ + Gaussian distribution with diagonal covariance matrix. 
+ + :param action_dim: (int) Number of actions + """ def __init__(self, action_dim): super(DiagGaussianDistribution, self).__init__() self.distribution = None @@ -53,12 +58,28 @@ class DiagGaussianDistribution(Distribution): self.log_std = None def proba_distribution_net(self, latent_dim, log_std_init=0.0): + """ + Create the layers and parameter that represent the distribution: + one output will be the mean of the gaussian, the other parameter will be the + standard deviation (log std in fact to allow negative values) + + :param latent_dim: (int) Dimension og the last layer of the policy (before the action layer) + :param log_std_init: (float) Initial value for the log standard deviation + """ mean_actions = nn.Linear(latent_dim, self.action_dim) # TODO: allow action dependent std log_std = nn.Parameter(th.ones(self.action_dim) * log_std_init) return mean_actions, log_std def proba_distribution(self, mean_actions, log_std, deterministic=False): + """ + Create and sample for the distribution given its parameters (mean, std) + + :param mean_actions: (th.Tensor) + :param log_std: (th.Tensor) + :param deterministic: (bool) + :return: (th.Tensor) + """ action_std = th.ones_like(mean_actions) * log_std.exp() self.distribution = Normal(mean_actions, action_std) if deterministic: @@ -77,6 +98,14 @@ class DiagGaussianDistribution(Distribution): return self.distribution.entropy() def log_prob_from_params(self, mean_actions, log_std): + """ + Compute the log probabilty of taking an action + given the distribution parameters. 
+ + :param mean_actions: (th.Tensor) + :param log_std: (th.Tensor) + :return: (th.Tensor, th.Tensor) + """ action, _ = self.proba_distribution(mean_actions, log_std) log_prob = self.log_prob(action) return action, log_prob From b9c20d443d7d0d719ba59635e9357f8a53ed98e9 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 18 Nov 2019 15:04:07 +0100 Subject: [PATCH 18/19] Update doc + add test for tanh bijector --- tests/test_distributions.py | 20 ++++ torchy_baselines/common/distributions.py | 122 +++++++++++++++++++---- 2 files changed, 124 insertions(+), 18 deletions(-) create mode 100644 tests/test_distributions.py diff --git a/tests/test_distributions.py b/tests/test_distributions.py new file mode 100644 index 0000000..47651e4 --- /dev/null +++ b/tests/test_distributions.py @@ -0,0 +1,20 @@ +import numpy as np +import torch as th + +from torchy_baselines.common.distributions import DiagGaussianDistribution, SquashedDiagGaussianDistribution,\ + CategoricalDistribution, TanhBijector + +# TODO: more tests for the other distributions +def test_bijector(): + """ + Test TanhBijector + """ + actions = th.ones(5) * 2.0 + + bijector = TanhBijector() + + squashed_actions = bijector.forward(actions) + # Check that the boundaries are not violated + assert th.max(th.abs(squashed_actions)) <= 1.0 + # Check the inverse method + assert th.isclose(TanhBijector.inverse(squashed_actions), actions).all() diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index 82a07a2..9d67b39 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -46,9 +46,10 @@ class Distribution(object): class DiagGaussianDistribution(Distribution): """ - Gaussian distribution with diagonal covariance matrix. + Gaussian distribution with diagonal covariance matrix, + for continuous actions. 
- :param action_dim: (int) Number of actions + :param action_dim: (int) Number of continuous actions """ def __init__(self, action_dim): super(DiagGaussianDistribution, self).__init__() @@ -65,6 +66,7 @@ class DiagGaussianDistribution(Distribution): :param latent_dim: (int) Dimension og the last layer of the policy (before the action layer) :param log_std_init: (float) Initial value for the log standard deviation + :return: (nn.Linear, nn.Parameter) """ mean_actions = nn.Linear(latent_dim, self.action_dim) # TODO: allow action dependent std @@ -111,6 +113,14 @@ class DiagGaussianDistribution(Distribution): return action, log_prob def log_prob(self, action): + """ + Get the log probabilty of an action given a distribution. + Note that you must call `proba_distribution()` method + before. + + :param action: (th.Tensor) + :return: (th.Tensor) + """ log_prob = self.distribution.log_prob(action) if len(log_prob.shape) > 1: log_prob = log_prob.sum(axis=1) @@ -120,6 +130,13 @@ class DiagGaussianDistribution(Distribution): class SquashedDiagGaussianDistribution(DiagGaussianDistribution): + """ + Gaussian distribution with diagonal covariance matrix, + followed by a squashing function (tanh) to ensure bounds. + + :param action_dim: (int) Number of continuous actions + :param epsilon: (float) small value to avoid NaN due to numerical imprecision. 
+ """ def __init__(self, action_dim, epsilon=1e-6): super(SquashedDiagGaussianDistribution, self).__init__(action_dim) # Avoid NaN (prevents division by zero or log of zero) @@ -146,27 +163,40 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution): def log_prob(self, action, gaussian_action=None): # Inverse tanh - # Naive implementation (not stable): 0.5 * torch.log((1 + x ) / (1 - x)) + # Naive implementation (not stable): 0.5 * torch.log((1 + x) / (1 - x)) # We use numpy to avoid numerical instability if gaussian_action is None: - # Clip to avoid NaN - clipped_action = np.clip(action.cpu().numpy(), -1.0 + self.epsilon, 1.0 + self.epsilon) - gaussian_action = th.from_numpy(np.arctanh(clipped_action)).to(action.device) + # It will be clipped to avoid NaN when inverting tanh + gaussian_action = TanhBijector.inverse(action) # Log likelihood for a gaussian distribution log_prob = super(SquashedDiagGaussianDistribution, self).log_prob(gaussian_action) # Squash correction (from original SAC implementation) + # this comes from the fact that tanh is bijective and differentiable log_prob -= th.sum(th.log(1 - action ** 2 + self.epsilon), dim=1) return log_prob class CategoricalDistribution(Distribution): + """ + Categorical distribution for discrete actions. + + :param action_dim: (int) Number of discrete actions + """ def __init__(self, action_dim): super(CategoricalDistribution, self).__init__() self.distribution = None self.action_dim = action_dim def proba_distribution_net(self, latent_dim): + """ + Create the layer that represents the distribution: + it will be the logits of the Categorical distribution. + You can then get probabilities using a softmax.
+ + :param latent_dim: (int) Dimension of the last layer of the policy (before the action layer) + :return: (nn.Linear) + """ action_logits = nn.Linear(latent_dim, self.action_dim) return action_logits @@ -198,6 +228,19 @@ class CategoricalDistribution(Distribution): class StateDependentNoiseDistribution(Distribution): + """ + Distribution class for using State Dependent Exploration (SDE). + It is used to create the noise exploration matrix and + compute the log probability of an action with that noise. + + :param action_dim: (int) Number of continuous actions + :param use_expln: (bool) Use `expln()` function instead of `exp()` to ensure + a positive standard deviation (cf paper). It allows to keep variance + above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + :param squash_output: (bool) Whether to squash the output using a tanh function, + this allows to ensure boundaries. + :param epsilon: (float) small value to avoid NaN due to numerical imprecision. + """ def __init__(self, action_dim, use_expln=False, squash_output=False, epsilon=1e-6): super(StateDependentNoiseDistribution, self).__init__() @@ -215,6 +258,13 @@ class StateDependentNoiseDistribution(Distribution): self.bijector = None def get_std(self, log_std): + """ + Get the standard deviation from the learned parameter + (log of it by default). This ensures that the std is positive. + + :param log_std: (th.Tensor) + :return: (th.Tensor) + """ if self.use_expln: # From SDE paper, it allows to keep variance # above zero and prevent it from growing too fast @@ -223,19 +273,44 @@ class StateDependentNoiseDistribution(Distribution): else: return th.log(log_std + 1.0) + 1.0 else: + # Use normal exponential return th.exp(log_std) def sample_weights(self, log_std): + """ + Sample weights for the noise exploration matrix, + using a centered gaussian distribution.
+ + :param log_std: (th.Tensor) + """ + # TODO: reduce the number of learned dimensions (cf TD3) self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std)) self.exploration_mat = self.weights_dist.rsample() def proba_distribution_net(self, latent_dim, log_std_init=0.0): + """ + Create the layers and parameter that represent the distribution: + one output will be the deterministic action, the other parameter will be the + standard deviation of the distribution that controls the weights of the noise matrix. + + :param latent_dim: (int) Dimension of the last layer of the policy (before the action layer) + :param log_std_init: (float) Initial value for the log standard deviation + :return: (nn.Linear, nn.Parameter) + """ mean_actions = nn.Linear(latent_dim, self.action_dim) log_std = nn.Parameter(th.ones(latent_dim, self.action_dim) * log_std_init) self.sample_weights(log_std) return mean_actions, log_std def proba_distribution(self, mean_actions, log_std, latent_pi, deterministic=False): + """ + Create and sample for the distribution given its parameters (mean, std) + + :param mean_actions: (th.Tensor) + :param log_std: (th.Tensor) + :param deterministic: (bool) + :return: (th.Tensor) + """ variance = th.mm(latent_pi.detach() ** 2, self.get_std(log_std) ** 2) self.distribution = Normal(mean_actions, th.sqrt(variance)) @@ -287,6 +362,13 @@ class StateDependentNoiseDistribution(Distribution): class TanhBijector(object): + """ + Bijective transformation of a probability distribution + using a squashing function (tanh) + TODO: use Pyro instead (https://pyro.ai/) + + :param epsilon: (float) small value to avoid NaN due to numerical imprecision.
+ """ def __init__(self, epsilon=1e-6): super(TanhBijector, self).__init__() self.epsilon = epsilon @@ -294,23 +376,27 @@ class TanhBijector(object): def forward(self, x): return th.tanh(x) - def inverse(self, action): + @staticmethod + def atanh(x): + """ + Inverse of Tanh + + Taken from pyro: https://github.com/pyro-ppl/pyro + 0.5 * torch.log((1 + x ) / (1 - x)) + """ + return 0.5 * (x.log1p() - (-x).log1p()) + + @staticmethod + def inverse(y): """ Inverse tanh. - From https://github.com/tensorflow/agents: - 0.99999997 is the maximum value such that atanh(x) is valid for both - float32 and float64 - - :param action: (th.Tensor) + :param y: (th.Tensor) :return: (th.Tensor) """ - # Inverse tanh - # Naive implementation (not stable): 0.5 * torch.log((1 + x ) / (1 - x)) - # We use numpy to avoid numerical instability - # Note: Using numpy, we do not keep the gradient - clipped_action = np.clip(action.cpu().numpy(), -0.99999997, 0.99999997) - return th.from_numpy(np.arctanh(clipped_action)).to(action.device) + eps = th.finfo(y.dtype).eps + # Clip the action to avoid NaN + return TanhBijector.atanh(y.clamp(min=-1. + eps, max=1. 
- eps)) def log_prob_correction(self, x): # Squash correction (from original SAC implementation) From ef59a7e431293a5904197f51578b34c4db4e38b3 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 18 Nov 2019 15:11:19 +0100 Subject: [PATCH 19/19] Update version + add docstring --- setup.py | 2 +- torchy_baselines/common/base_class.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 89a3b57..f40a7da 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ setup(name='torchy_baselines', license="MIT", long_description="", long_description_content_type='text/markdown', - version="0.0.4", + version="0.0.5a", ) # python setup.py sdist diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index a6b9a41..89b3e98 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -283,6 +283,12 @@ class BaseRLModel(object): raise NotImplementedError() def set_random_seed(self, seed=None): + """ + Set the seed of the pseudo-random generators + (python, numpy, pytorch, gym, action_space) + + :param seed: (int) + """ if seed is None: return set_random_seed(seed, using_cuda=self.device == th.device('cuda'))