mirror of
https://github.com/saymrwulf/stable-baselines3.git
synced 2026-06-28 03:21:16 +00:00
resolving conflicts
# Conflicts: # torchy_baselines/a2c/a2c.py # torchy_baselines/ppo/ppo.py Added optimizer params test
This commit is contained in:
commit
c75582dfbe
10 changed files with 440 additions and 49 deletions
2
setup.py
2
setup.py
|
|
@ -34,7 +34,7 @@ setup(name='torchy_baselines',
|
|||
license="MIT",
|
||||
long_description="",
|
||||
long_description_content_type='text/markdown',
|
||||
version="0.0.4",
|
||||
version="0.0.5a",
|
||||
)
|
||||
|
||||
# python setup.py sdist
|
||||
|
|
|
|||
20
tests/test_distributions.py
Normal file
20
tests/test_distributions.py
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
import numpy as np
|
||||
import torch as th
|
||||
|
||||
from torchy_baselines.common.distributions import DiagGaussianDistribution, SquashedDiagGaussianDistribution,\
|
||||
CategoricalDistribution, TanhBijector
|
||||
|
||||
# TODO: more tests for the other distributions
|
||||
def test_bijector():
|
||||
"""
|
||||
Test TanhBijector
|
||||
"""
|
||||
actions = th.ones(5) * 2.0
|
||||
|
||||
bijector = TanhBijector()
|
||||
|
||||
squashed_actions = bijector.forward(actions)
|
||||
# Check that the boundaries are not violated
|
||||
assert th.max(th.abs(squashed_actions)) <= 1.0
|
||||
# Check the inverse method
|
||||
assert th.isclose(TanhBijector.inverse(squashed_actions), actions).all()
|
||||
60
tests/test_sde.py
Normal file
60
tests/test_sde.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
import pytest
|
||||
|
||||
import gym
|
||||
import torch as th
|
||||
from torch.distributions import Normal
|
||||
|
||||
from torchy_baselines import A2C
|
||||
from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize
|
||||
from torchy_baselines.common.monitor import Monitor
|
||||
|
||||
|
||||
def test_state_dependent_exploration():
|
||||
"""
|
||||
Check that the gradient correspond to the expected one
|
||||
"""
|
||||
n_states = 2
|
||||
state_dim = 3
|
||||
# TODO: fix for action_dim > 1
|
||||
action_dim = 1
|
||||
sigma = th.ones(state_dim, 1, requires_grad=True)
|
||||
# Reduce the number of parameters
|
||||
# sigma_ = th.ones(state_dim, action_dim) * sigma_
|
||||
|
||||
# weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma))
|
||||
th.manual_seed(2)
|
||||
weights_dist = Normal(th.zeros_like(sigma), sigma)
|
||||
|
||||
weights = weights_dist.rsample()
|
||||
state = th.rand(n_states, state_dim)
|
||||
mu = th.ones(action_dim)
|
||||
# print(weights.shape, state.shape)
|
||||
noise = th.mm(state, weights)
|
||||
|
||||
variance = th.mm(state ** 2, sigma ** 2)
|
||||
action_dist = Normal(mu, th.sqrt(variance))
|
||||
|
||||
loss = action_dist.log_prob((mu + noise).detach()).mean()
|
||||
loss.backward()
|
||||
|
||||
# From Rueckstiess paper
|
||||
grad = th.zeros_like(sigma)
|
||||
for j in range(action_dim):
|
||||
for i in range(state_dim):
|
||||
a = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j])
|
||||
grad[i, j] = a.mean()
|
||||
|
||||
# sigma.grad should be equal to grad
|
||||
assert sigma.grad.allclose(grad)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_class", [A2C])
|
||||
def test_state_dependent_noise(model_class):
|
||||
env_id = 'MountainCarContinuous-v0'
|
||||
|
||||
env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), norm_reward=True)
|
||||
eval_env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), training=False, norm_reward=False)
|
||||
|
||||
model = model_class('MlpPolicy', env, n_steps=200, use_sde=True, ent_coef=0.00, verbose=1, learning_rate=3e-4,
|
||||
policy_kwargs=dict(log_std_init=0.0, ortho_init=False), seed=None)
|
||||
model.learn(total_timesteps=int(1000), log_interval=5, eval_freq=500, eval_env=eval_env)
|
||||
|
|
@ -5,6 +5,7 @@ import torch.nn.functional as F
|
|||
from torchy_baselines.common.utils import explained_variance
|
||||
from torchy_baselines.ppo.ppo import PPO
|
||||
from torchy_baselines.ppo.policies import PPOPolicy
|
||||
from torchy_baselines.common import logger
|
||||
|
||||
|
||||
class A2C(PPO):
|
||||
|
|
@ -30,6 +31,8 @@ class A2C(PPO):
|
|||
:param rms_prop_eps: (float) RMSProp epsilon. It stabilizes square root computation in denominator
|
||||
of RMSProp update
|
||||
:param use_rms_prop: (bool) Whether to use RMSprop (default) or Adam as optimizer
|
||||
:param use_sde: (bool) Whether to use State Dependent Exploration (SDE)
|
||||
instead of action noise exploration (default: False)
|
||||
:param normalize_advantage: (bool) Whether to normalize or not the advantage
|
||||
:param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
|
||||
:param create_eval_env: (bool) Whether to create a second environment that will be
|
||||
|
|
@ -45,7 +48,7 @@ class A2C(PPO):
|
|||
def __init__(self, policy, env, learning_rate=7e-4,
|
||||
n_steps=5, gamma=0.99, gae_lambda=1.0,
|
||||
ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5,
|
||||
rms_prop_eps=1e-5, use_rms_prop=True,
|
||||
rms_prop_eps=1e-5, use_rms_prop=True, use_sde=False,
|
||||
normalize_advantage=False, tensorboard_log=None, create_eval_env=False,
|
||||
policy_kwargs=None, verbose=0, seed=0, device='auto',
|
||||
_init_setup_model=True):
|
||||
|
|
@ -53,7 +56,7 @@ class A2C(PPO):
|
|||
super(A2C, self).__init__(policy, env, learning_rate=learning_rate,
|
||||
n_steps=n_steps, batch_size=None, n_epochs=1,
|
||||
gamma=gamma, gae_lambda=gae_lambda, ent_coef=ent_coef,
|
||||
vf_coef=vf_coef, max_grad_norm=max_grad_norm,
|
||||
vf_coef=vf_coef, max_grad_norm=max_grad_norm, use_sde=use_sde,
|
||||
tensorboard_log=tensorboard_log, policy_kwargs=policy_kwargs,
|
||||
verbose=verbose, device=device, create_eval_env=create_eval_env,
|
||||
seed=seed, _init_setup_model=False)
|
||||
|
|
@ -73,7 +76,6 @@ class A2C(PPO):
|
|||
eps=self.rms_prop_eps, weight_decay=0)
|
||||
|
||||
def train(self, gradient_steps, batch_size=None):
|
||||
|
||||
# Update optimizer learning rate
|
||||
self._update_learning_rate(self.policy.optimizer)
|
||||
# A2C with gradient_steps > 1 does not make sense
|
||||
|
|
@ -113,10 +115,16 @@ class A2C(PPO):
|
|||
# Clip grad norm
|
||||
th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
|
||||
self.policy.optimizer.step()
|
||||
# approx_kl_divs.append(th.mean(old_log_prob - log_prob).detach().cpu().numpy())
|
||||
|
||||
# print(explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(),
|
||||
# self.rollout_buffer.values.flatten().cpu().numpy()))
|
||||
explained_var = explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(),
|
||||
self.rollout_buffer.values.flatten().cpu().numpy())
|
||||
|
||||
logger.logkv("explained_variance", explained_var)
|
||||
logger.logkv("entropy", entropy.mean().item())
|
||||
logger.logkv("policy_loss", policy_loss.item())
|
||||
logger.logkv("value_loss", value_loss.item())
|
||||
if hasattr(self.policy, 'log_std'):
|
||||
logger.logkv("std", th.exp(self.policy.log_std).mean().item())
|
||||
|
||||
def learn(self, total_timesteps, callback=None, log_interval=100,
|
||||
eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="A2C", reset_num_timesteps=True):
|
||||
|
|
|
|||
|
|
@ -355,7 +355,15 @@ class BaseRLModel(object):
|
|||
|
||||
return data, params, opt_params
|
||||
|
||||
def set_random_seed(self, seed=0):
|
||||
def set_random_seed(self, seed=None):
|
||||
"""
|
||||
Set the seed of the pseudo-random generators
|
||||
(python, numpy, pytorch, gym, action_space)
|
||||
|
||||
:param seed: (int)
|
||||
"""
|
||||
if seed is None:
|
||||
return
|
||||
set_random_seed(seed, using_cuda=self.device == th.device('cuda'))
|
||||
self.action_space.seed(seed)
|
||||
if self.env is not None:
|
||||
|
|
|
|||
|
|
@ -113,22 +113,42 @@ class RolloutBuffer(BaseBuffer):
|
|||
self.generator_ready = False
|
||||
super(RolloutBuffer, self).reset()
|
||||
|
||||
def compute_returns_and_advantage(self, last_value, dones=False):
|
||||
def compute_returns_and_advantage(self, last_value, dones=False, use_gae=True):
|
||||
"""
|
||||
From PPO2
|
||||
From Stable-Baselines PPO2
|
||||
:param last_value: (th.Tensor)
|
||||
:param dones: ([bool])
|
||||
:param use_gae: (bool) Whether to use Generalized Advantage Estimation
|
||||
or normal advantage for advantage computation.
|
||||
"""
|
||||
last_gae_lam = 0
|
||||
for step in reversed(range(self.buffer_size)):
|
||||
if step == self.buffer_size - 1:
|
||||
next_non_terminal = th.FloatTensor(1.0 - dones)
|
||||
next_value = last_value.clone().cpu().flatten()
|
||||
else:
|
||||
next_non_terminal = 1.0 - self.dones[step + 1]
|
||||
next_value = self.values[step + 1]
|
||||
delta = self.rewards[step] + self.gamma * next_value * next_non_terminal - self.values[step]
|
||||
last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam
|
||||
self.advantages[step] = last_gae_lam
|
||||
self.returns = self.advantages + self.values
|
||||
if use_gae:
|
||||
last_gae_lam = 0
|
||||
for step in reversed(range(self.buffer_size)):
|
||||
if step == self.buffer_size - 1:
|
||||
next_non_terminal = th.FloatTensor(1.0 - dones)
|
||||
next_value = last_value.clone().cpu().flatten()
|
||||
else:
|
||||
next_non_terminal = 1.0 - self.dones[step + 1]
|
||||
next_value = self.values[step + 1]
|
||||
delta = self.rewards[step] + self.gamma * next_value * next_non_terminal - self.values[step]
|
||||
last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam
|
||||
self.advantages[step] = last_gae_lam
|
||||
self.returns = self.advantages + self.values
|
||||
else:
|
||||
# Discounted return with value bootstrap
|
||||
# Note: this is equivalent to GAE computation
|
||||
# with gae_lambda = 1.0
|
||||
last_return = 0.0
|
||||
for step in reversed(range(self.buffer_size)):
|
||||
if step == self.buffer_size - 1:
|
||||
next_non_terminal = th.FloatTensor(1.0 - dones)
|
||||
next_value = last_value.clone().cpu().flatten()
|
||||
last_return = self.rewards[step] + next_non_terminal * next_value
|
||||
else:
|
||||
next_non_terminal = 1.0 - self.dones[step + 1]
|
||||
last_return = self.rewards[step] + self.gamma * last_return * next_non_terminal
|
||||
self.returns[step] = last_return
|
||||
self.advantages = self.returns - self.values
|
||||
|
||||
def add(self, obs, action, reward, done, value, log_prob):
|
||||
if len(log_prob.shape) == 0:
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import numpy as np
|
|||
import torch as th
|
||||
import torch.nn as nn
|
||||
from torch.distributions import Normal, Categorical
|
||||
import torch.nn.functional as F
|
||||
from gym import spaces
|
||||
|
||||
|
||||
|
|
@ -45,6 +46,12 @@ class Distribution(object):
|
|||
|
||||
|
||||
class DiagGaussianDistribution(Distribution):
|
||||
"""
|
||||
Gaussian distribution with diagonal covariance matrix,
|
||||
for continuous actions.
|
||||
|
||||
:param action_dim: (int) Number of continuous actions
|
||||
"""
|
||||
def __init__(self, action_dim):
|
||||
super(DiagGaussianDistribution, self).__init__()
|
||||
self.distribution = None
|
||||
|
|
@ -53,12 +60,29 @@ class DiagGaussianDistribution(Distribution):
|
|||
self.log_std = None
|
||||
|
||||
def proba_distribution_net(self, latent_dim, log_std_init=0.0):
|
||||
"""
|
||||
Create the layers and parameter that represent the distribution:
|
||||
one output will be the mean of the gaussian, the other parameter will be the
|
||||
standard deviation (log std in fact to allow negative values)
|
||||
|
||||
:param latent_dim: (int) Dimension og the last layer of the policy (before the action layer)
|
||||
:param log_std_init: (float) Initial value for the log standard deviation
|
||||
:return: (nn.Linear, nn.Parameter)
|
||||
"""
|
||||
mean_actions = nn.Linear(latent_dim, self.action_dim)
|
||||
# TODO: allow action dependent std
|
||||
log_std = nn.Parameter(th.ones(self.action_dim) * log_std_init)
|
||||
return mean_actions, log_std
|
||||
|
||||
def proba_distribution(self, mean_actions, log_std, deterministic=False):
|
||||
"""
|
||||
Create and sample for the distribution given its parameters (mean, std)
|
||||
|
||||
:param mean_actions: (th.Tensor)
|
||||
:param log_std: (th.Tensor)
|
||||
:param deterministic: (bool)
|
||||
:return: (th.Tensor)
|
||||
"""
|
||||
action_std = th.ones_like(mean_actions) * log_std.exp()
|
||||
self.distribution = Normal(mean_actions, action_std)
|
||||
if deterministic:
|
||||
|
|
@ -77,11 +101,27 @@ class DiagGaussianDistribution(Distribution):
|
|||
return self.distribution.entropy()
|
||||
|
||||
def log_prob_from_params(self, mean_actions, log_std):
|
||||
"""
|
||||
Compute the log probabilty of taking an action
|
||||
given the distribution parameters.
|
||||
|
||||
:param mean_actions: (th.Tensor)
|
||||
:param log_std: (th.Tensor)
|
||||
:return: (th.Tensor, th.Tensor)
|
||||
"""
|
||||
action, _ = self.proba_distribution(mean_actions, log_std)
|
||||
log_prob = self.log_prob(action)
|
||||
return action, log_prob
|
||||
|
||||
def log_prob(self, action):
|
||||
"""
|
||||
Get the log probabilty of an action given a distribution.
|
||||
Note that you must call `proba_distribution()` method
|
||||
before.
|
||||
|
||||
:param action: (th.Tensor)
|
||||
:return: (th.Tensor)
|
||||
"""
|
||||
log_prob = self.distribution.log_prob(action)
|
||||
if len(log_prob.shape) > 1:
|
||||
log_prob = log_prob.sum(axis=1)
|
||||
|
|
@ -91,6 +131,13 @@ class DiagGaussianDistribution(Distribution):
|
|||
|
||||
|
||||
class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
|
||||
"""
|
||||
Gaussian distribution with diagonal covariance matrix,
|
||||
followed by a squashing function (tanh) to ensure bounds.
|
||||
|
||||
:param action_dim: (int) Number of continuous actions
|
||||
:param epsilon: (float) small value to avoid NaN due to numerical imprecision.
|
||||
"""
|
||||
def __init__(self, action_dim, epsilon=1e-6):
|
||||
super(SquashedDiagGaussianDistribution, self).__init__(action_dim)
|
||||
# Avoid NaN (prevents division by zero or log of zero)
|
||||
|
|
@ -118,25 +165,40 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution):
|
|||
|
||||
def log_prob(self, action, gaussian_action=None):
|
||||
# Inverse tanh
|
||||
# Naive implementation (not stable): 0.5 * torch.log((1 + x ) / (1 - x))
|
||||
# Naive implementation (not stable): 0.5 * torch.log((1 + x) / (1 - x))
|
||||
# We use numpy to avoid numerical instability
|
||||
if gaussian_action is None:
|
||||
gaussian_action = th.from_numpy(np.arctanh(action.cpu().numpy())).to(action.device)
|
||||
# It will be clipped to avoid NaN when inversing tanh
|
||||
gaussian_action = TanhBijector.inverse(action)
|
||||
|
||||
# Log likelihood for a gaussian distribution
|
||||
log_prob = super(SquashedDiagGaussianDistribution, self).log_prob(gaussian_action)
|
||||
# Squash correction (from original SAC implementation)
|
||||
# this comes from the fact that tanh is bijective and differentiable
|
||||
log_prob -= th.sum(th.log(1 - action ** 2 + self.epsilon), dim=1)
|
||||
return log_prob
|
||||
|
||||
|
||||
class CategoricalDistribution(Distribution):
|
||||
"""
|
||||
Categorical distribution for discrete actions.
|
||||
|
||||
:param action_dim: (int) Number of discrete actions
|
||||
"""
|
||||
def __init__(self, action_dim):
|
||||
super(CategoricalDistribution, self).__init__()
|
||||
self.distribution = None
|
||||
self.action_dim = action_dim
|
||||
|
||||
def proba_distribution_net(self, latent_dim):
|
||||
"""
|
||||
Create the layer that represents the distribution:
|
||||
it will be the logits of the Categorical distribution.
|
||||
You can then get probabilties using a softmax.
|
||||
|
||||
:param latent_dim: (int) Dimension og the last layer of the policy (before the action layer)
|
||||
:return: (nn.Linear)
|
||||
"""
|
||||
action_logits = nn.Linear(latent_dim, self.action_dim)
|
||||
return action_logits
|
||||
|
||||
|
|
@ -167,15 +229,195 @@ class CategoricalDistribution(Distribution):
|
|||
return log_prob
|
||||
|
||||
|
||||
def make_proba_distribution(action_space):
|
||||
class StateDependentNoiseDistribution(Distribution):
|
||||
"""
|
||||
Distribution class for using State Dependent Exploration (SDE).
|
||||
It is used to create the noise exploration matrix and
|
||||
compute the log probabilty of an action with that noise.
|
||||
|
||||
:param action_dim: (int) Number of continuous actions
|
||||
:param use_expln: (bool) Use `expln()` function instead of `exp()` to ensure
|
||||
a positive standard deviation (cf paper). It allows to keep variance
|
||||
above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
|
||||
:param squash_output: (bool) Whether to squash the output using a tanh function,
|
||||
this allows to ensure boundaries.
|
||||
:param epsilon: (float) small value to avoid NaN due to numerical imprecision.
|
||||
"""
|
||||
def __init__(self, action_dim, use_expln=False,
|
||||
squash_output=False, epsilon=1e-6):
|
||||
super(StateDependentNoiseDistribution, self).__init__()
|
||||
self.distribution = None
|
||||
self.action_dim = action_dim
|
||||
self.mean_actions = None
|
||||
self.log_std = None
|
||||
self.weights_dist = None
|
||||
self.exploration_mat = None
|
||||
self.use_expln = use_expln
|
||||
if squash_output:
|
||||
print("== Using TanhBijector ===")
|
||||
self.bijector = TanhBijector(epsilon)
|
||||
else:
|
||||
self.bijector = None
|
||||
|
||||
def get_std(self, log_std):
|
||||
"""
|
||||
Get the standard deviation from the learned parameter
|
||||
(log of it by default). This ensures that the std is positive.
|
||||
|
||||
:param log_std: (th.Tensor)
|
||||
:return: (th.Tensor)
|
||||
"""
|
||||
if self.use_expln:
|
||||
# From SDE paper, it allows to keep variance
|
||||
# above zero and prevent it from growing too fast
|
||||
if log_std <= 0:
|
||||
return th.exp(log_std)
|
||||
else:
|
||||
return th.log(log_std + 1.0) + 1.0
|
||||
else:
|
||||
# Use normal exponential
|
||||
return th.exp(log_std)
|
||||
|
||||
def sample_weights(self, log_std):
|
||||
"""
|
||||
Sample weights for the noise exploration matrix,
|
||||
using a centered gaussian distribution.
|
||||
|
||||
:param log_std: (th.Tensor)
|
||||
"""
|
||||
# TODO: reduce the number of learned dimensions (cf TD3)
|
||||
self.weights_dist = Normal(th.zeros_like(log_std), self.get_std(log_std))
|
||||
self.exploration_mat = self.weights_dist.rsample()
|
||||
|
||||
def proba_distribution_net(self, latent_dim, log_std_init=0.0):
|
||||
"""
|
||||
Create the layers and parameter that represent the distribution:
|
||||
one output will be the deterministic action, the other parameter will be the
|
||||
standard deviation of the distribution that control the weights of the noise matrix.
|
||||
|
||||
:param latent_dim: (int) Dimension og the last layer of the policy (before the action layer)
|
||||
:param log_std_init: (float) Initial value for the log standard deviation
|
||||
:return: (nn.Linear, nn.Parameter)
|
||||
"""
|
||||
mean_actions = nn.Linear(latent_dim, self.action_dim)
|
||||
log_std = nn.Parameter(th.ones(latent_dim, self.action_dim) * log_std_init)
|
||||
self.sample_weights(log_std)
|
||||
return mean_actions, log_std
|
||||
|
||||
def proba_distribution(self, mean_actions, log_std, latent_pi, deterministic=False):
|
||||
"""
|
||||
Create and sample for the distribution given its parameters (mean, std)
|
||||
|
||||
:param mean_actions: (th.Tensor)
|
||||
:param log_std: (th.Tensor)
|
||||
:param deterministic: (bool)
|
||||
:return: (th.Tensor)
|
||||
"""
|
||||
variance = th.mm(latent_pi.detach() ** 2, self.get_std(log_std) ** 2)
|
||||
self.distribution = Normal(mean_actions, th.sqrt(variance))
|
||||
|
||||
if deterministic:
|
||||
action = self.mode()
|
||||
else:
|
||||
action = self.sample(latent_pi)
|
||||
return action, self
|
||||
|
||||
def mode(self):
|
||||
action = self.distribution.mean
|
||||
if self.bijector is not None:
|
||||
return self.bijector.forward(action)
|
||||
return action
|
||||
|
||||
def sample(self, latent_pi):
|
||||
noise = th.mm(latent_pi.detach(), self.exploration_mat)
|
||||
action = self.distribution.mean + noise
|
||||
if self.bijector is not None:
|
||||
return self.bijector.forward(action)
|
||||
return action
|
||||
|
||||
def entropy(self):
|
||||
# TODO: account for the squashing?
|
||||
return self.distribution.entropy()
|
||||
|
||||
def log_prob_from_params(self, mean_actions, log_std, latent_pi):
|
||||
action, _ = self.proba_distribution(mean_actions, log_std, latent_pi)
|
||||
log_prob = self.log_prob(action)
|
||||
return action, log_prob
|
||||
|
||||
def log_prob(self, action):
|
||||
if self.bijector is not None:
|
||||
gaussian_action = self.bijector.inverse(action)
|
||||
else:
|
||||
gaussian_action = action
|
||||
# log likelihood for a gaussian
|
||||
log_prob = self.distribution.log_prob(gaussian_action)
|
||||
|
||||
if len(log_prob.shape) > 1:
|
||||
log_prob = log_prob.sum(axis=1)
|
||||
else:
|
||||
log_prob = log_prob.sum()
|
||||
|
||||
if self.bijector is not None:
|
||||
# Squash correction (from original SAC implementation)
|
||||
log_prob -= th.sum(self.bijector.log_prob_correction(gaussian_action), dim=1)
|
||||
return log_prob
|
||||
|
||||
|
||||
class TanhBijector(object):
|
||||
"""
|
||||
Bijective transformation of a probabilty distribution
|
||||
using a squashing function (tanh)
|
||||
TODO: use Pyro instead (https://pyro.ai/)
|
||||
|
||||
:param epsilon: (float) small value to avoid NaN due to numerical imprecision.
|
||||
"""
|
||||
def __init__(self, epsilon=1e-6):
|
||||
super(TanhBijector, self).__init__()
|
||||
self.epsilon = epsilon
|
||||
|
||||
def forward(self, x):
|
||||
return th.tanh(x)
|
||||
|
||||
@staticmethod
|
||||
def atanh(x):
|
||||
"""
|
||||
Inverse of Tanh
|
||||
|
||||
Taken from pyro: https://github.com/pyro-ppl/pyro
|
||||
0.5 * torch.log((1 + x ) / (1 - x))
|
||||
"""
|
||||
return 0.5 * (x.log1p() - (-x).log1p())
|
||||
|
||||
@staticmethod
|
||||
def inverse(y):
|
||||
"""
|
||||
Inverse tanh.
|
||||
|
||||
:param y: (th.Tensor)
|
||||
:return: (th.Tensor)
|
||||
"""
|
||||
eps = th.finfo(y.dtype).eps
|
||||
# Clip the action to avoid NaN
|
||||
return TanhBijector.atanh(y.clamp(min=-1. + eps, max=1. - eps))
|
||||
|
||||
def log_prob_correction(self, x):
|
||||
# Squash correction (from original SAC implementation)
|
||||
return th.log(1 - th.tanh(x) ** 2 + self.epsilon)
|
||||
|
||||
|
||||
def make_proba_distribution(action_space, use_sde=False):
|
||||
"""
|
||||
Return an instance of Distribution for the correct type of action space
|
||||
|
||||
:param action_space: (Gym Space) the input action space
|
||||
:param use_sde: (bool) Force the use of StateDependentNoiseDistribution
|
||||
instead of DiagGaussianDistribution
|
||||
:return: (Distribution) the approriate Distribution object
|
||||
"""
|
||||
if isinstance(action_space, spaces.Box):
|
||||
assert len(action_space.shape) == 1, "Error: the action space must be a vector"
|
||||
if use_sde:
|
||||
return StateDependentNoiseDistribution(action_space.shape[0])
|
||||
return DiagGaussianDistribution(action_space.shape[0])
|
||||
elif isinstance(action_space, spaces.Discrete):
|
||||
return CategoricalDistribution(action_space.n)
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ class Monitor(Wrapper):
|
|||
EXT = "monitor.csv"
|
||||
file_handler = None
|
||||
|
||||
def __init__(self, env, filename, allow_early_resets=True, reset_keywords=(), info_keywords=()):
|
||||
def __init__(self, env, filename=None, allow_early_resets=True, reset_keywords=(), info_keywords=()):
|
||||
"""
|
||||
A monitor wrapper for Gym environments, it is used to know the episode reward, length, time and other data.
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,8 @@ import torch.nn as nn
|
|||
import numpy as np
|
||||
|
||||
from torchy_baselines.common.policies import BasePolicy, register_policy, create_mlp
|
||||
from torchy_baselines.common.distributions import make_proba_distribution, DiagGaussianDistribution, CategoricalDistribution
|
||||
from torchy_baselines.common.distributions import make_proba_distribution,\
|
||||
DiagGaussianDistribution, CategoricalDistribution, StateDependentNoiseDistribution
|
||||
|
||||
|
||||
class MlpExtractor(nn.Module):
|
||||
|
|
@ -101,7 +102,8 @@ class MlpExtractor(nn.Module):
|
|||
class PPOPolicy(BasePolicy):
|
||||
def __init__(self, observation_space, action_space,
|
||||
learning_rate, net_arch=None, device='cpu',
|
||||
activation_fn=nn.Tanh, adam_epsilon=1e-5, ortho_init=True):
|
||||
activation_fn=nn.Tanh, adam_epsilon=1e-5,
|
||||
ortho_init=True, use_sde=False, log_std_init=0.0):
|
||||
super(PPOPolicy, self).__init__(observation_space, action_space, device)
|
||||
self.obs_dim = self.observation_space.shape[0]
|
||||
if net_arch is None:
|
||||
|
|
@ -118,21 +120,25 @@ class PPOPolicy(BasePolicy):
|
|||
}
|
||||
self.shared_net = None
|
||||
self.pi_net, self.vf_net = None, None
|
||||
# Action distribution
|
||||
self.action_dist = make_proba_distribution(action_space)
|
||||
# In the future, feature_extractor will be replaced with a CNN
|
||||
self.features_extractor = nn.Flatten()
|
||||
self.features_dim = self.obs_dim
|
||||
self.log_std_init = log_std_init
|
||||
# Action distribution
|
||||
self.action_dist = make_proba_distribution(action_space, use_sde=use_sde)
|
||||
|
||||
self._build(learning_rate)
|
||||
|
||||
def reset_noise_net(self):
|
||||
self.action_dist.sample_weights(self.log_std)
|
||||
|
||||
def _build(self, learning_rate):
|
||||
self.mlp_extractor = MlpExtractor(self.features_dim, net_arch=self.net_arch,
|
||||
activation_fn=self.activation_fn, device=self.device)
|
||||
|
||||
# self.action_net = nn.Linear(self.net_arch[-1], self.action_dim)
|
||||
# self.log_std = nn.Parameter(th.zeros(self.action_dim))
|
||||
if isinstance(self.action_dist, DiagGaussianDistribution):
|
||||
self.action_net, self.log_std = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi)
|
||||
if isinstance(self.action_dist, (DiagGaussianDistribution, StateDependentNoiseDistribution)):
|
||||
self.action_net, self.log_std = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi,
|
||||
log_std_init=self.log_std_init)
|
||||
elif isinstance(self.action_dist, CategoricalDistribution):
|
||||
self.action_net = self.action_dist.proba_distribution_net(latent_dim=self.mlp_extractor.latent_dim_pi)
|
||||
|
||||
|
|
@ -162,21 +168,26 @@ class PPOPolicy(BasePolicy):
|
|||
def _get_latent(self, obs):
|
||||
return self.mlp_extractor(self.features_extractor(obs))
|
||||
|
||||
def _get_action_dist_from_latent(self, latent, deterministic=False):
|
||||
mean_actions = self.action_net(latent)
|
||||
def _get_action_dist_from_latent(self, latent_pi, deterministic=False):
|
||||
mean_actions = self.action_net(latent_pi)
|
||||
|
||||
if isinstance(self.action_dist, DiagGaussianDistribution):
|
||||
return self.action_dist.proba_distribution(mean_actions, self.log_std, deterministic=deterministic)
|
||||
|
||||
elif isinstance(self.action_dist, CategoricalDistribution):
|
||||
return self.action_dist.proba_distribution(mean_actions, deterministic=deterministic)
|
||||
|
||||
elif isinstance(self.action_dist, StateDependentNoiseDistribution):
|
||||
return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_pi, deterministic=deterministic)
|
||||
|
||||
def actor_forward(self, obs, deterministic=False):
|
||||
latent_pi, _ = self._get_latent(obs)
|
||||
action, _ = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic)
|
||||
return action.detach().cpu().numpy()
|
||||
|
||||
def get_policy_stats(self, obs, action):
|
||||
def get_policy_stats(self, obs, action, deterministic=False):
|
||||
latent_pi, latent_vf = self._get_latent(obs)
|
||||
_, action_distribution = self._get_action_dist_from_latent(latent_pi)
|
||||
_, action_distribution = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic)
|
||||
log_prob = action_distribution.log_prob(action)
|
||||
value = self.value_net(latent_vf)
|
||||
return value, log_prob, action_distribution.entropy()
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ from torchy_baselines.common.base_class import BaseRLModel
|
|||
from torchy_baselines.common.evaluation import evaluate_policy
|
||||
from torchy_baselines.common.buffers import RolloutBuffer
|
||||
from torchy_baselines.common.utils import explained_variance, get_schedule_fn
|
||||
from torchy_baselines.common.vec_env import VecNormalize
|
||||
from torchy_baselines.common.vec_env import VecNormalize, VecEnvWrapper
|
||||
from torchy_baselines.common import logger
|
||||
from torchy_baselines.ppo.policies import PPOPolicy
|
||||
|
||||
|
|
@ -53,6 +53,8 @@ class PPO(BaseRLModel):
|
|||
:param ent_coef: (float) Entropy coefficient for the loss calculation
|
||||
:param vf_coef: (float) Value function coefficient for the loss calculation
|
||||
:param max_grad_norm: (float) The maximum value for the gradient clipping
|
||||
:param use_sde: (bool) Whether to use State Dependent Exploration (SDE)
|
||||
instead of action noise exploration (default: False)
|
||||
:param target_kl: (float) Limit the KL divergence between updates,
|
||||
because the clipping is not enough to prevent large update
|
||||
see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
|
||||
|
|
@ -71,7 +73,7 @@ class PPO(BaseRLModel):
|
|||
def __init__(self, policy, env, learning_rate=3e-4,
|
||||
n_steps=2048, batch_size=64, n_epochs=10,
|
||||
gamma=0.99, gae_lambda=0.95, clip_range=0.2, clip_range_vf=None,
|
||||
ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5,
|
||||
ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, use_sde=False,
|
||||
target_kl=None, tensorboard_log=None, create_eval_env=False,
|
||||
policy_kwargs=None, verbose=0, seed=0, device='auto',
|
||||
_init_setup_model=True):
|
||||
|
|
@ -95,6 +97,7 @@ class PPO(BaseRLModel):
|
|||
self.target_kl = target_kl
|
||||
self.tensorboard_log = tensorboard_log
|
||||
self.tb_writer = None
|
||||
self.use_sde = use_sde
|
||||
|
||||
if _init_setup_model:
|
||||
self._setup_model()
|
||||
|
|
@ -117,19 +120,20 @@ class PPO(BaseRLModel):
|
|||
self.rollout_buffer = RolloutBuffer(self.n_steps, state_dim, action_dim, self.device,
|
||||
gamma=self.gamma, gae_lambda=self.gae_lambda, n_envs=self.n_envs)
|
||||
self.policy = self.policy(self.observation_space, self.action_space,
|
||||
self.learning_rate, device=self.device, **self.policy_kwargs)
|
||||
self.learning_rate, use_sde=self.use_sde, device=self.device,
|
||||
**self.policy_kwargs)
|
||||
self.policy = self.policy.to(self.device)
|
||||
|
||||
self.clip_range = get_schedule_fn(self.clip_range)
|
||||
if self.clip_range_vf is not None:
|
||||
self.clip_range_vf = get_schedule_fn(self.clip_range_vf)
|
||||
|
||||
def select_action(self, observation,deterministic=False):
|
||||
def select_action(self, observation, deterministic=False):
|
||||
# Normally not needed
|
||||
observation = np.array(observation)
|
||||
with th.no_grad():
|
||||
observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device)
|
||||
return self.policy.actor_forward(observation, deterministic)
|
||||
return self.policy.actor_forward(observation, deterministic=deterministic)
|
||||
|
||||
def predict(self, observation, state=None, mask=None, deterministic=False):
|
||||
"""
|
||||
|
|
@ -141,7 +145,7 @@ class PPO(BaseRLModel):
|
|||
:param deterministic: (bool) Whether or not to return deterministic actions.
|
||||
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
|
||||
"""
|
||||
clipped_actions = self.select_action(observation)
|
||||
clipped_actions = self.select_action(observation, deterministic=deterministic)
|
||||
if isinstance(self.action_space, gym.spaces.Box):
|
||||
clipped_actions = np.clip(clipped_actions, self.action_space.low, self.action_space.high)
|
||||
return clipped_actions
|
||||
|
|
@ -151,6 +155,10 @@ class PPO(BaseRLModel):
|
|||
|
||||
n_steps = 0
|
||||
rollout_buffer.reset()
|
||||
# Sample new weights for the state dependent exploration
|
||||
# TODO: ensure episodic setting?
|
||||
if self.use_sde:
|
||||
self.policy.reset_noise_net()
|
||||
|
||||
while n_steps < n_rollout_steps:
|
||||
with th.no_grad():
|
||||
|
|
@ -219,6 +227,7 @@ class PPO(BaseRLModel):
|
|||
# Value loss using the TD(gae_lambda) target
|
||||
value_loss = F.mse_loss(return_batch, values_pred)
|
||||
|
||||
|
||||
# Entropy loss favor exploration
|
||||
entropy_loss = -th.mean(entropy)
|
||||
|
||||
|
|
@ -233,11 +242,19 @@ class PPO(BaseRLModel):
|
|||
approx_kl_divs.append(th.mean(old_log_prob - log_prob).detach().cpu().numpy())
|
||||
|
||||
if self.target_kl is not None and np.mean(approx_kl_divs) > 1.5 * self.target_kl:
|
||||
print("Early stopping at step {} due to reaching max kl: {:.2f}".format(gradient_step, np.mean(approx_kl_divs)))
|
||||
print("Early stopping at step {} due to reaching max kl: {:.2f}".format(it, np.mean(approx_kl_divs)))
|
||||
break
|
||||
|
||||
# print(explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(),
|
||||
# self.rollout_buffer.values.flatten().cpu().numpy()))
|
||||
explained_var = explained_variance(self.rollout_buffer.returns.flatten().cpu().numpy(),
|
||||
self.rollout_buffer.values.flatten().cpu().numpy())
|
||||
|
||||
logger.logkv("explained_variance", explained_var)
|
||||
# TODO: gather stats for the entropy and other losses?
|
||||
logger.logkv("entropy", entropy.mean().item())
|
||||
logger.logkv("policy_loss", policy_loss.item())
|
||||
logger.logkv("value_loss", value_loss.item())
|
||||
if hasattr(self.policy, 'log_std'):
|
||||
logger.logkv("std", th.exp(self.policy.log_std).mean().item())
|
||||
|
||||
def learn(self, total_timesteps, callback=None, log_interval=1,
|
||||
eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", reset_num_timesteps=True):
|
||||
|
|
@ -278,9 +295,14 @@ class PPO(BaseRLModel):
|
|||
# Evaluate agent
|
||||
if 0 < eval_freq <= timesteps_since_eval and eval_env is not None:
|
||||
timesteps_since_eval %= eval_freq
|
||||
# TODO: move that to the base class
|
||||
# Sync eval env and train env when using VecNormalize
|
||||
if isinstance(self.env, VecNormalize):
|
||||
eval_env.obs_rms = deepcopy(self.env.obs_rms)
|
||||
env_tmp, eval_env_tmp = self.env, eval_env
|
||||
while isinstance(env_tmp, VecEnvWrapper):
|
||||
if isinstance(env_tmp, VecNormalize):
|
||||
eval_env_tmp.obs_rms = deepcopy(env_tmp.obs_rms)
|
||||
env_tmp = env_tmp.venv
|
||||
eval_env_tmp.venv
|
||||
mean_reward, _ = evaluate_policy(self, eval_env, n_eval_episodes)
|
||||
if self.tb_writer is not None:
|
||||
self.tb_writer.add_scalar('Eval/reward', mean_reward, self.num_timesteps)
|
||||
|
|
|
|||
Loading…
Reference in a new issue